
"""PyTorch Data2VecVision model."""

import collections.abc
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    SemanticSegmenterOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import (
    compile_compatible_method_lru_cache,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from ...utils import auto_docstring, logging, torch_int
from .configuration_data2vec_vision import Data2VecVisionConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`Data2VecVisionModel`].
    """
)
class Data2VecVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    """


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # one Bernoulli draw per sample, broadcast over all remaining dimensions
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class Data2VecVisionDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return f"p={self.drop_prob}"
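

# Illustrative sketch (not part of the original module): with drop_prob=0.25 and training=True,
# roughly a quarter of the samples in a batch have their residual branch zeroed, and the surviving
# samples are rescaled by 1 / 0.75 so the expectation of the output matches the input.
#
#   x = torch.ones(8, 197, 768)
#   out = drop_path(x, drop_prob=0.25, training=True)
#   # each out[i] is either all zeros or all 1 / 0.75, one Bernoulli draw per sample
#
# At inference time (training=False) the function is the identity.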


class Data2VecVisionEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = Data2VecVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: Optional[bool] = None,
    ) -> torch.Tensor:
        if interpolate_pos_encoding is not None:
            warnings.warn(
                "`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always "
                "interpolated to the input image size. The argument will be removed in transformers v4.51.0."
            )

        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)

        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)


class Data2VecVisionPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)


class Data2VecVisionSelfAttention(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.has_relative_position_bias = bool(window_size)
        if self.has_relative_position_bias:
            self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int]] = None,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Add relative position bias if present.
        if self.has_relative_position_bias:
            height, width = resolution
            window_size = (height // self.config.patch_size, width // self.config.patch_size)
            attention_scores = attention_scores + self.relative_position_bias(
                window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1]
            )

        # Add shared relative position bias if provided.
        if relative_position_bias is not None:
            attention_scores = attention_scores + relative_position_bias

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class Data2VecVisionSdpaSelfAttention(Data2VecVisionSelfAttention):
    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int]] = None,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        if output_attentions or head_mask is not None:
            logger.warning_once(
                "`Data2VecVisionSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` "
                "does not support `output_attentions=True` or `head_mask`. Falling back to the manual attention "
                "implementation, but specifying the manual implementation will be required from Transformers version "
                'v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when '
                "loading the model."
            )
            return super().forward(
                hidden_states=hidden_states,
                head_mask=head_mask,
                output_attentions=output_attentions,
                relative_position_bias=relative_position_bias,
                interpolate_pos_encoding=interpolate_pos_encoding,
                resolution=resolution,
            )

        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        attn_bias = None
        if self.has_relative_position_bias:
            height, width = resolution
            window_size = (height // self.config.patch_size, width // self.config.patch_size)
            attn_bias = self.relative_position_bias(
                window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1]
            )

        if relative_position_bias is not None:
            if attn_bias is None:
                attn_bias = relative_position_bias
            else:
                attn_bias = attn_bias + relative_position_bias

        scaling = 1 / math.sqrt(self.attention_head_size)
        context_layer = torch.nn.functional.scaled_dot_product_attention(
            query_layer,
            key_layer,
            value_layer,
            attn_mask=attn_bias,
            dropout_p=self.config.attention_probs_dropout_prob if self.training else 0.0,
            is_causal=False,
            scale=scaling,
        )
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        return context_layer, None


class Data2VecVisionSelfOutput(nn.Module):
    """
    The residual connection is defined in Data2VecVisionLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


DATA2VEC_VISION_SELF_ATTENTION_CLASSES = {
    "eager": Data2VecVisionSelfAttention,
    "sdpa": Data2VecVisionSdpaSelfAttention,
}


class Data2VecVisionAttention(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.attention = DATA2VEC_VISION_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, window_size=window_size
        )
        self.output = Data2VecVisionSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional["Data2VecVisionRelativePositionBias"] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int]] = None,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        self_outputs = self.attention(
            hidden_states, head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding, resolution
        )

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class Data2VecVisionIntermediate(nn.Module):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class Data2VecVisionOutput(nn.Module):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class Data2VecVisionLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(
        self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0
    ) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = Data2VecVisionAttention(config, window_size=window_size)
        self.intermediate = Data2VecVisionIntermediate(config)
        self.output = Data2VecVisionOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.drop_path = Data2VecVisionDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        if init_values > 0:
            self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
            self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        else:
            self.lambda_1, self.lambda_2 = None, None

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        relative_position_bias: Optional[torch.Tensor] = None,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int, int]] = None,
    ) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in Data2VecVision, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
            relative_position_bias=relative_position_bias,
            interpolate_pos_encoding=interpolate_pos_encoding,
            resolution=resolution,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # apply lambda_1 if present
        if self.lambda_1 is not None:
            attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = self.drop_path(attention_output) + hidden_states

        # in Data2VecVision, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.intermediate(layer_output)
        layer_output = self.output(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = self.drop_path(layer_output) + hidden_states

        outputs = (layer_output,) + outputs

        return outputs


class Data2VecVisionRelativePositionBias(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: tuple) -> None:
        super().__init__()
        self.window_size = window_size
        # 2*Wh-1 * 2*Ww-1 intra-window distances + 3 extra entries for cls-to-token, token-to-cls and cls-to-cls
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, config.num_attention_heads)
        )

    @compile_compatible_method_lru_cache(maxsize=10)
    def generate_relative_position_index(self, window_size: tuple[int, int]) -> torch.Tensor:
        """
        This method creates the relative position index, modified to support arbitrary window sizes,
        as introduced in [MiDaS v3.1](https://huggingface.co/papers/2307.14460).
        """
        num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        window_area = window_size[0] * window_size[1]

        # get pair-wise relative position index for each token inside the window
        grid = torch.meshgrid(torch.arange(window_size[0]), torch.arange(window_size[1]), indexing="ij")
        coords = torch.stack(grid)  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1

        relative_position_index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = num_relative_distance - 3
        relative_position_index[0:, 0] = num_relative_distance - 2
        relative_position_index[0, 0] = num_relative_distance - 1
        return relative_position_index

    def forward(self, window_size, interpolate_pos_encoding: bool = False, dim_size=None) -> torch.Tensor:
        """
        Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
        """
        old_height = 2 * self.window_size[0] - 1
        old_width = 2 * self.window_size[1] - 1

        new_height = 2 * window_size[0] - 1
        new_width = 2 * window_size[1] - 1

        old_relative_position_bias_table = self.relative_position_bias_table

        old_num_relative_distance = self.num_relative_distance
        new_num_relative_distance = new_height * new_width + 3

        old_sub_table = old_relative_position_bias_table[: old_num_relative_distance - 3]

        old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2)
        new_sub_table = nn.functional.interpolate(
            old_sub_table, size=(torch_int(new_height), torch_int(new_width)), mode="bilinear"
        )
        new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1)

        new_relative_position_bias_table = torch.cat(
            [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3 :]]
        )

        relative_position_index = self.generate_relative_position_index(window_size)
        relative_position_bias = new_relative_position_bias_table[relative_position_index.view(-1)]
        relative_position_bias = relative_position_bias.view(
            window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1
        )  # Wh*Ww+1, Wh*Ww+1, nH
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww+1, Wh*Ww+1

        if interpolate_pos_encoding:
            relative_position_bias = nn.functional.interpolate(
                relative_position_bias.unsqueeze(1),
                size=(dim_size, dim_size),
                mode="bilinear",
                align_corners=False,
            ).squeeze(1)

        return relative_position_bias.unsqueeze(0)


class Data2VecVisionEncoder(nn.Module):
    def __init__(self, config: Data2VecVisionConfig, window_size: Optional[tuple] = None) -> None:
        super().__init__()
        self.config = config
        self.has_relative_position_bias = config.use_shared_relative_position_bias
        if self.has_relative_position_bias:
            self.relative_position_bias = Data2VecVisionRelativePositionBias(config, window_size=window_size)

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")]
        self.layer = nn.ModuleList(
            [
                Data2VecVisionLayer(
                    config,
                    window_size=window_size if config.use_relative_position_bias else None,
                    drop_path_rate=dpr[i],
                )
                for i in range(config.num_hidden_layers)
            ]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        interpolate_pos_encoding: bool = False,
        resolution: Optional[tuple[int, int]] = None,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.has_relative_position_bias:
                height, width = resolution
                window_size = (height // self.config.patch_size, width // self.config.patch_size)
                relative_position_bias = self.relative_position_bias(
                    window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1]
                )
            else:
                relative_position_bias = None

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                head_mask=layer_head_mask,
                output_attentions=output_attentions,
                relative_position_bias=relative_position_bias,
                interpolate_pos_encoding=interpolate_pos_encoding,
                resolution=resolution,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class Data2VecVisionPreTrainedModel(PreTrainedModel):
    config: Data2VecVisionConfig
    base_model_prefix = "data2vec_vision"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Data2VecVisionLayer"]
    _keys_to_ignore_on_load_unexpected = [r".*relative_position_index.*"]
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, Data2VecVisionEmbeddings):
            module.cls_token.data.zero_()
            if module.mask_token is not None:
                module.mask_token.data.zero_()
            if module.position_embeddings is not None:
                module.position_embeddings.data.zero_()
        elif isinstance(module, Data2VecVisionRelativePositionBias):
            module.relative_position_bias_table.data.zero_()
        elif isinstance(module, Data2VecVisionLayer):
            if module.lambda_1 is not None:
                module.lambda_1.data.fill_(self.config.layer_scale_init_value)
                module.lambda_2.data.fill_(self.config.layer_scale_init_value)


@auto_docstring
class Data2VecVisionModel(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig, add_pooling_layer: bool = False) -> None:
        r"""
        add_pooling_layer (`bool`, *optional*, defaults to `False`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = Data2VecVisionEmbeddings(config)
        self.encoder = Data2VecVisionEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )
        self.pooler = Data2VecVisionPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, Data2VecVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Prepare head mask if needed (1.0 in head_mask indicates we keep the head)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
        resolution = pixel_values.shape[2:]

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            resolution=resolution,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return Data2VecVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Data2VecVisionPooler(nn.Module):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()
        self.layernorm = (
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if self.layernorm is not None:
            # Mean pool the final hidden states of the patch tokens
            patch_tokens = hidden_states[:, 1:, :]
            pooled_output = self.layernorm(patch_tokens.mean(1))
        else:
            # Pool by simply taking the final hidden state of the [CLS] token
            pooled_output = hidden_states[:, 0]

        return pooled_output
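

# Minimal usage sketch for the bare backbone (illustration only, not part of the original module).
# It assumes the "facebook/data2vec-vision-base" checkpoint referenced in the docstrings below and
# that `image` is a PIL image; shapes are for the base configuration (224x224 input, 16x16 patches).
#
#   from transformers import AutoImageProcessor, Data2VecVisionModel
#
#   image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
#   model = Data2VecVisionModel.from_pretrained("facebook/data2vec-vision-base", add_pooling_layer=True)
#   inputs = image_processor(images=image, return_tensors="pt")
#   with torch.no_grad():
#       outputs = model(**inputs)
#   outputs.last_hidden_state.shape  # (1, 197, 768): [CLS] token + 14*14 patch tokens
#   outputs.pooler_output.shape      # (1, 768); only populated when add_pooling_layer=True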


@auto_docstring(
    custom_intro="""
    Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
    the final hidden states of the patch tokens) e.g. for ImageNet.
    """
)
class Data2VecVisionForImageClassification(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=True)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.data2vec_vision(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class Data2VecVisionConvModule(nn.Module):
    """
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: Union[int, tuple[int, int]],
        padding: Union[int, tuple[int, int], str] = 0,
        bias: bool = False,
        dilation: Union[int, tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=padding,
            bias=bias,
            dilation=dilation,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.activation = nn.ReLU()

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = self.conv(input)
        output = self.bn(output)
        output = self.activation(output)

        return output


class Data2VecVisionPyramidPoolingBlock(nn.Module):
    def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None:
        super().__init__()
        self.layers = [
            nn.AdaptiveAvgPool2d(pool_scale),
            Data2VecVisionConvModule(in_channels, channels, kernel_size=1),
        ]
        for i, layer in enumerate(self.layers):
            self.add_module(str(i), layer)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        hidden_state = input
        for layer in self.layers:
            hidden_state = layer(hidden_state)
        return hidden_state


class Data2VecVisionPyramidPoolingModule(nn.Module):
    """
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, pool_scales: tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None:
        super().__init__()
        self.pool_scales = pool_scales
        self.align_corners = align_corners
        self.in_channels = in_channels
        self.channels = channels
        self.blocks = []
        for i, pool_scale in enumerate(pool_scales):
            block = Data2VecVisionPyramidPoolingBlock(
                pool_scale=pool_scale, in_channels=in_channels, channels=channels
            )
            self.blocks.append(block)
            self.add_module(str(i), block)

    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
        ppm_outs = []
        for ppm in self.blocks:
            ppm_out = ppm(x)
            upsampled_ppm_out = nn.functional.interpolate(
                ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners
            )
            ppm_outs.append(upsampled_ppm_out)
        return ppm_outs


class Data2VecVisionUperHead(nn.Module):
    """
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://huggingface.co/papers/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__()

        self.pool_scales = config.pool_scales  # e.g. (1, 2, 3, 6)
        self.in_channels = [config.hidden_size] * 4  # e.g. [768, 768, 768, 768]
        self.channels = config.hidden_size
        self.align_corners = False
        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

        # PSP Module
        self.psp_modules = Data2VecVisionPyramidPoolingModule(
            self.pool_scales,
            self.in_channels[-1],
            self.channels,
            align_corners=self.align_corners,
        )
        self.bottleneck = Data2VecVisionConvModule(
            self.in_channels[-1] + len(self.pool_scales) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )
        # FPN Module
        self.lateral_convs = nn.ModuleList()
        self.fpn_convs = nn.ModuleList()
        for in_channels in self.in_channels[:-1]:  # skip the top layer
            l_conv = Data2VecVisionConvModule(in_channels, self.channels, kernel_size=1)
            fpn_conv = Data2VecVisionConvModule(self.channels, self.channels, kernel_size=3, padding=1)
            self.lateral_convs.append(l_conv)
            self.fpn_convs.append(fpn_conv)

        self.fpn_bottleneck = Data2VecVisionConvModule(
            len(self.in_channels) * self.channels,
            self.channels,
            kernel_size=3,
            padding=1,
        )

    def psp_forward(self, inputs):
        x = inputs[-1]
        psp_outs = [x]
        psp_outs.extend(self.psp_modules(x))
        psp_outs = torch.cat(psp_outs, dim=1)
        output = self.bottleneck(psp_outs)

        return output

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # build laterals
        laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)]

        laterals.append(self.psp_forward(encoder_hidden_states))

        # build top-down path
        used_backbone_levels = len(laterals)
        for i in range(used_backbone_levels - 1, 0, -1):
            prev_shape = laterals[i - 1].shape[2:]
            laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate(
                laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners
            )

        # build outputs
        fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)]
        # append psp feature
        fpn_outs.append(laterals[-1])

        for i in range(used_backbone_levels - 1, 0, -1):
            fpn_outs[i] = nn.functional.interpolate(
                fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners
            )
        fpn_outs = torch.cat(fpn_outs, dim=1)
        output = self.fpn_bottleneck(fpn_outs)
        output = self.classifier(output)

        return output


class Data2VecVisionFCNHead(nn.Module):
    """
    Fully Convolution Networks for Semantic Segmentation. This head is the implementation of
    [FCNNet](https://huggingface.co/papers/1411.4038).

    Args:
        config (Data2VecVisionConfig): Configuration.
        in_index (int): Index of the feature map to use. Default: 2.
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.


    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    """

    def __init__(
        self,
        config: Data2VecVisionConfig,
        in_index: int = 2,
        kernel_size: int = 3,
        dilation: Union[int, tuple[int, int]] = 1,
    ) -> None:
        super().__init__()
        self.in_channels = config.hidden_size
        self.channels = config.auxiliary_channels
        self.num_convs = config.auxiliary_num_convs
        self.concat_input = config.auxiliary_concat_input
        self.in_index = in_index

        conv_padding = (kernel_size // 2) * dilation
        convs = []
        convs.append(
            Data2VecVisionConvModule(
                self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
            )
        )
        for i in range(self.num_convs - 1):
            convs.append(
                Data2VecVisionConvModule(
                    self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation
                )
            )
        if self.num_convs == 0:
            self.convs = nn.Identity()
        else:
            self.convs = nn.Sequential(*convs)
        if self.concat_input:
            self.conv_cat = Data2VecVisionConvModule(
                self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2
            )

        self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1)

    def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor:
        # just take the relevant feature maps
        hidden_states = encoder_hidden_states[self.in_index]
        output = self.convs(hidden_states)
        if self.concat_input:
            output = self.conv_cat(torch.cat([hidden_states, output], dim=1))
        output = self.classifier(output)
        return output


@auto_docstring
class Data2VecVisionForSemanticSegmentation(Data2VecVisionPreTrainedModel):
    def __init__(self, config: Data2VecVisionConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=False)

        if len(self.config.out_indices) != 4:
            raise ValueError(
                "Data2VecVisionForSemanticSegmentation requires config.out_indices to be a list of 4 integers, "
                "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
                "a base-sized architecture."
            )

        # FPNs
        self.fpn1 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
            nn.BatchNorm2d(config.hidden_size),
            nn.GELU(),
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn2 = nn.Sequential(
            nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
        )
        self.fpn3 = nn.Identity()
        self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)

        # Semantic segmentation head(s)
        self.decode_head = Data2VecVisionUperHead(config)
        self.auxiliary_head = Data2VecVisionFCNHead(config) if config.use_auxiliary_head else None

        # Initialize weights and apply final processing
        self.post_init()

    def compute_loss(self, logits, auxiliary_logits, labels):
        # upsample logits to the images' original size
        upsampled_logits = nn.functional.interpolate(
            logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
        )
        if auxiliary_logits is not None:
            upsampled_auxiliary_logits = nn.functional.interpolate(
                auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
            )
        # compute weighted loss
        loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
        main_loss = loss_fct(upsampled_logits, labels)
        loss = main_loss
        if auxiliary_logits is not None:
            auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels)
            loss += self.config.auxiliary_loss_weight * auxiliary_loss

        return loss

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SemanticSegmenterOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Data2VecVisionForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
        >>> model = Data2VecVisionForSemanticSegmentation.from_pretrained("facebook/data2vec-vision-base")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if labels is not None and self.config.num_labels == 1:
            raise ValueError("The number of labels should be greater than one")

        outputs = self.data2vec_vision(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=True,  # we need the intermediate hidden states
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # only keep certain features, and reshape
        # note that we do +1 as the encoder_hidden_states also include the initial embeddings
        features = [feature for idx, feature in enumerate(encoder_hidden_states) if idx + 1 in self.config.out_indices]
        batch_size = pixel_values.shape[0]
        patch_resolution = self.config.image_size // self.config.patch_size
        features = [
            x[:, 1:, :].permute(0, 2, 1).reshape(batch_size, -1, patch_resolution, patch_resolution) for x in features
        ]

        # apply FPNs
        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for i in range(len(features)):
            features[i] = ops[i](features[i])

        logits = self.decode_head(features)

        auxiliary_logits = None
        if self.auxiliary_head is not None:
            auxiliary_logits = self.auxiliary_head(features)

        loss = None
        if labels is not None:
            loss = self.compute_loss(logits, auxiliary_logits, labels)

        if not return_dict:
            if output_hidden_states:
                output = (logits,) + outputs[1:]
            else:
                output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SemanticSegmenterOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


__all__ = [
    "Data2VecVisionForImageClassification",
    "Data2VecVisionForSemanticSegmentation",
    "Data2VecVisionModel",
    "Data2VecVisionPreTrainedModel",
]