
    rhD                        d Z ddlZddlmZ ddlmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZmZm Z  ddl!m"Z" ddl#m$Z$  ejJ                  e&      Z'e ed       G d de                    Z(e ed       G d de                    Z) G d dejT                        Z+ G d dejT                        Z, G d dejT                        Z-	 dSdejT                  de	j\                  d e	j\                  d!e	j\                  d"ee	j\                     d#e/d$e/fd%Z0 G d& d'ejT                        Z1 G d( d)ejT                        Z2 G d* d+ejT                        Z3 G d, d-ejT                        Z4 G d. d/ejT                        Z5 G d0 d1e      Z6 G d2 d3ejT                        Z7 G d4 d5ejT                        Z8d6 Z9 G d7 d8ejT                        Z: G d9 d:ejT                        Z; G d; d<ejT                        Z< G d= d>ejT                        Z=e G d? d@e             Z>e G dA dBe>             Z? G dC dDejT                        Z@ G dE dFejT                        ZA G dG dHejT                        ZB edI       G dJ dKe>             ZC G dL dMejT                        ZD G dN dOejT                        ZEe G dP dQe>             ZFg dRZGy)TzPyTorch DPT (Dense Prediction Transformers) model.

This implementation is heavily inspired by OpenMMLab's implementation, found here:
https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py.

    N)	dataclass)CallableOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputDepthEstimatorOutputSemanticSegmenterOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging	torch_int)load_backbone   )	DPTConfigz
    Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful
    in the context of Vision models.:
    )custom_introc                   l    e Zd ZU dZdZeej                     ed<   dZ	ee
ej                  df      ed<   y)*BaseModelOutputWithIntermediateActivationsak  
    last_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_states.intermediate_activations)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tuple     w/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/dpt/modeling_dpt.pyr   r   ,   s?     7;!2!23:HLhuU->->-C'DELr(   r   z
    Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate
    activations that can be used by the model at later stages.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   dZeeej                  df      ed<   y)	4BaseModelOutputWithPoolingAndIntermediateActivationsa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Last layer hidden-state of the first token of the sequence (classification token) after further processing
        through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
        the classification token after processing through a linear layer and a tanh activation function. The linear
        layer weights are trained from the next sentence prediction (classification) objective during pretraining.
    intermediate_activations (`tuple(torch.FloatTensor)`, *optional*):
        Intermediate activations that can be used to compute hidden states of the model at various layers.
    Nlast_hidden_statepooler_output.hidden_states
attentionsr   )r   r    r!   r"   r,   r   r#   r$   r%   r-   r.   r&   r/   r   r'   r(   r)   r+   r+   ?   s     6:x 1 12915M8E--.5=AM8E%"3"3S"89:A:>Ju00#567>HLhuU->->-C'DELr(   r+   c            	       p     e Zd ZdZd	 fd	Zd
dZ	 ddej                  dededej                  fdZ	 xZ
S )DPTViTHybridEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                 b   t         
|           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }t        |      | _        | j                  j                  d   }t        | j                  j                        dk7  r+t        dt        | j                  j                               ddg| _        ||j                   }	|	dd  }|	d   }nCt        |t        j                  j                        r|n||f}| j                  j                  d   }|| _        |d   | _        || _        t#        j$                  ||d      | _        t#        j(                  t+        j,                  dd|j
                              | _        t#        j(                  t+        j,                  d|dz   |j
                              | _        y )Nr   r   r	   z1Expected backbone to have 3 output features, got kernel_size)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterabler   backbonechannelslen
ValueErrorresidual_feature_map_indexbackbone_featmap_shaper   Conv2d
projection	Parameterr#   zeros	cls_tokenposition_embeddings)selfconfigfeature_sizer9   r:   r;   r<   num_patchesfeature_dimfeat_map_shape	__class__s             r)   r8   zDPTViTHybridEmbeddings.__init___   s   !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY%f-mm,,R0t}}%%&!+PQTUYUbUbUkUkQlPmnoo+,a&'#::N)"#.L(+K !+<9Q9Q RYegsXt  --004K$$Q-())K!Lekk!Q8J8J&KL#%<<A{QPVPbPb0c#d r(   c                 r   |d d d |f   }|d|d f   }t        t        |      dz        }|j                  d||d      j                  dddd      }t        j
                  j                  |||fd      }|j                  dddd      j                  d||z  d      }t        j                  ||gd	      }|S 
Nr         ?r   r3   r	      bilinear)sizemodedim)	r   rC   reshapepermuter   
functionalinterpolater#   catrM   posembgrid_size_heightgrid_size_widthstart_index
posemb_tokposemb_gridold_grid_sizes           r)   _resize_pos_embedz(DPTViTHybridEmbeddings._resize_pos_embed   s    A||O,
Q_-!#k"2c"9:!))!]M2NVVWXZ[]^`abmm//CSUdBelv/w!))!Q15==aAQTcAceghJ4!<r(   pixel_valuesinterpolate_pos_encodingreturn_dictreturnc                    |j                   \  }}}}|| j                  k7  rt        d      |sV|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  | j
                  || j                  z  || j                  z        }| j                  |      }	|	j                  d   }
| j                  D cg c]  }|	j                  |    }}| j                  |
      j                  d	      j                  dd	      }| j                  j                  |dd      }t        j                   ||fd
      }||z   }|s||fS t#        ||      S c c}w )NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*z) doesn't match model (z).r3   rW   r[   )r   r   )shaper;   rD   r9   rj   rL   r:   rA   feature_mapsrE   rH   flatten	transposerK   expandr#   ra   r   )rM   rk   rl   rm   
batch_sizer;   heightwidthrL   backbone_outputfeaturesindexoutput_hidden_states
embeddings
cls_tokenss                  r)   forwardzDPTViTHybridEmbeddings.forward   s    3?2D2D/
L&%4,,,w  (++u8J/J (% 9+,Adooa.@-AE 
 #44$$f&?$//AY
 --5"//3 RVQpQpq < <U Cqq__X.66q9CCAqI
^^**:r2>
YY
J7Q?
  "55
 455 :)%9
 	
  rs   )E?Nr   )FF)r   r    r!   r"   r8   rj   r#   Tensorboolr   __classcell__rS   s   @r)   r1   r1   X   sH     eD gl)
!LL)
DH)
_c)
	)
r(   r1   c                   2     e Zd ZdZ fdZddZddZ xZS )DPTViTEmbeddingszB
    Construct the CLS token, position and patch embeddings.

    c                    t         |           t        j                  t	        j
                  dd|j                              | _        t        |      | _	        | j                  j                  }t        j                  t	        j
                  d|dz   |j                              | _        t        j                  |j                        | _        || _        y )Nr   )r7   r8   r   rI   r#   rJ   r<   rK   DPTViTPatchEmbeddingspatch_embeddingsrP   rL   Dropouthidden_dropout_probdropoutrN   )rM   rN   rP   rS   s      r)   r8   zDPTViTEmbeddings.__init__   s    ekk!Q8J8J&KL 5f =++77#%<<A{QPVPbPb0c#d zz&"<"<=r(   c                 ~   |d d d |f   }|d|d f   }t        |j                  d      dz        }|j                  d||d      j                  dddd      }t        j
                  j                  |||fd      }|j                  dddd      j                  d||z  d      }t        j                  ||gd	      }|S rU   )	r   rY   r]   r^   r   r_   r`   r#   ra   rb   s           r)   rj   z"DPTViTEmbeddings._resize_pos_embed   s    A||O,
Q_-!+"2"21"5"<=!))!]M2NVVWXZ[]^`abmm//CSUdBelv/w!))!Q15==aAQTcAceghJ4!<r(   c                    |j                   \  }}}}| j                  j                  }| j                  | j                  ||z  ||z        }| j                  |      }	|	j                         \  }}
}| j                  j                  |dd      }t        j                  ||	fd      }	|	|z   }	| j                  |	      }	|s|	fS t        |	      S )Nr3   r   r[   )r   )rr   rN   r:   rj   rL   r   rY   rK   rv   r#   ra   r   r   )rM   rk   rm   rw   r;   rx   ry   r:   rL   r~   seq_len_r   s                r)   r   zDPTViTEmbeddings.forward   s    2>2D2D/
L&% [[++
"44$$f
&:EZ<O
 **<8
!+!2
GQ ^^**:r2>
YY
J7Q?
  "55
\\*-
= 9ZXXr(   r   )F)r   r    r!   r"   r8   rj   r   r   r   s   @r)   r   r      s    
Yr(   r   c                   (     e Zd ZdZ fdZd Z xZS )r   z$
    Image to Patch Embedding.

    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   )r6   stride)r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rP   r   rG   rH   )rM   rN   r9   r:   r;   r<   rP   rS   s          r)   r8   zDPTViTPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hir(   c                     |j                   \  }}}}|| j                  k7  rt        d      | j                  |      j	                  d      j                  dd      }|S )Nrp   rW   r   )rr   r;   rD   rH   rt   ru   )rM   rk   rw   r;   rx   ry   r~   s          r)   r   zDPTViTPatchEmbeddings.forward
  sb    2>2D2D/
L&%4,,,w  __\2::1=GG1M
r(   r   r    r!   r"   r8   r   r   r   s   @r)   r   r      s    
jr(   r   modulequerykeyvalueattention_maskscalingr   c                    t        j                  ||j                  dd            |z  }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }|||z  }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr3   r4   )r\   dtype)ptrainingr   rW   )r#   matmulru   r   r_   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r)   eager_attention_forwardr     s     <<s}}R'<=GL ==((2U]](SVVW\WbWbcL ==((6??([L !#n4,,|U3K''1-88:K$$r(   c            
            e Zd Zdeddf fdZ	 	 ddeej                     dede	e
ej                  ej                  f   e
ej                     f   fdZ xZS )	DPTSelfAttentionrN   rn   Nc                 2   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        |j                  | _        | j                  dz  | _        d| _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r7   r8   r<   num_attention_headshasattrrD   rN   intattention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr   r   r   rM   rN   rS   s     r)   r8   zDPTSelfAttention.__init__5  sF    : ::a?PVXhHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r(   	head_maskoutput_attentionsc           
         |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }	t        }
| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt        | j                  j                     }
 |
| |	|||| j                  | j                  | j                   sdn| j"                        \  }}|j%                         d d	 | j&                  fz   }|j)                  |      }|r||f}|S |f}|S )
Nr3   r   rW   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r   r4   )rr   r   viewr   r   ru   r   r   r   rN   _attn_implementationloggerwarning_oncer   r   r   r   r   rY   r   r]   )rM   r.   r   r   rw   
seq_lengthr   	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapeoutputss                  r)   r   zDPTSelfAttention.forwardI  s    %2$7$7!
JHH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 )@;;++w6{{//69>O##L
 '>dkk>^>^&_#)<nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S%--.EF6G=/2 O\M]r(   NF)r   r    r!   r   r8   r   r#   r   r   r   r&   r   r   r   s   @r)   r   r   4  sr    ]y ]T ]. -1"'	1 ELL)1  	1
 
uU\\5<</0%2EE	F1r(   r   c                   |     e Zd ZdZdeddf fdZdej                  dej                  dej                  fdZ xZ	S )	DPTViTSelfOutputz
    The residual connection is defined in DPTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rN   rn   Nc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y r   )	r7   r8   r   r   r<   denser   r   r   r   s     r)   r8   zDPTViTSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r(   r.   input_tensorc                 J    | j                  |      }| j                  |      }|S r   r   r   rM   r.   r   s      r)   r   zDPTViTSelfOutput.forward  s$    

=1]3r(   )
r   r    r!   r"   r   r8   r#   r   r   r   r   s   @r)   r   r   ~  sD    
>y >T >
U\\  RWR^R^ r(   r   c                        e Zd Zdeddf fdZdee   ddfdZ	 	 ddej                  de
ej                     d	edeeej                  ej                  f   eej                     f   fd
Z xZS )DPTViTAttentionrN   rn   Nc                     t         |           t        |      | _        t	        |      | _        t               | _        y r   )r7   r8   r   	attentionr   outputsetpruned_headsr   s     r)   r8   zDPTViTAttention.__init__  s0    )&1&v.Er(   headsc                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r[   )rC   r   r   r   r   r   r   r   r   r   r   r   r   union)rM   r   r|   s      r)   prune_headszDPTViTAttention.prune_heads  s   u:?74>>55t~~7Y7Y[_[l[l
u
  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r(   r.   r   r   c                 h    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r   r   )rM   r.   r   r   self_outputsattention_outputr   s          r)   r   zDPTViTAttention.forward  sE     ~~mY@QR;;|AF#%QR(88r(   r   )r   r    r!   r   r8   r   r   r   r#   r   r   r   r   r&   r   r   r   s   @r)   r   r     s    "y "T ";S ;d ;, -1"'	|| ELL)  	
 
uU\\5<</0%2EE	Fr(   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )DPTViTIntermediaterN   rn   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r7   r8   r   r   r<   intermediate_sizer   r=   
hidden_actstrr
   intermediate_act_fnr   s     r)   r8   zDPTViTIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r(   r.   c                 J    | j                  |      }| j                  |      }|S r   )r   r   )rM   r.   s     r)   r   zDPTViTIntermediate.forward  s&    

=100?r(   	r   r    r!   r   r8   r#   r   r   r   r   s   @r)   r   r     s1    9y 9T 9U\\ ell r(   r   c                   x     e Zd Zdeddf fdZdej                  dej                  dej                  fdZ xZS )DPTViTOutputrN   rn   Nc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
r7   r8   r   r   r   r<   r   r   r   r   r   s     r)   r8   zDPTViTOutput.__init__  sB    YYv779K9KL
zz&"<"<=r(   r.   r   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   s      r)   r   zDPTViTOutput.forward  s.    

=1]3%4r(   r   r   s   @r)   r   r     s?    >y >T >
U\\  RWR^R^ r(   r   c                        e Zd ZdZdeddf fdZ	 	 d
dej                  deej                     de	de
eej                  ej                  f   eej                     f   fd	Z xZS )DPTViTLayerz?This corresponds to the Block class in the timm implementation.rN   rn   Nc                 r   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y )Nr   eps)r7   r8   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr<   layer_norm_epslayernorm_beforelayernorm_afterr   s     r)   r8   zDPTViTLayer.__init__  s    '-'E'E$(0.v6"6* "V-?-?VEZEZ [!||F,>,>FDYDYZr(   r.   r   r   c                     | j                  | j                  |      ||      }|d   }|dd  }||z   }| j                  |      }| j                  |      }| j	                  ||      }|f|z   }|S )N)r   r   r   )r   r   r   r   r   )rM   r.   r   r   self_attention_outputsr   r   layer_outputs           r)   r   zDPTViTLayer.forward  s     "&!!-0/ "0 "

 2!4(, )=8 ++M:((6 {{<?/G+r(   r   )r   r    r!   r"   r   r8   r#   r   r   r   r   r&   r   r   r   s   @r)   r   r     s    I[y [T [ -1"'	|| ELL)  	
 
uU\\5<</0%2EE	Fr(   r   c                        e Zd Zdeddf fdZ	 	 	 	 ddej                  deej                     deded	ede	e
ef   fd
Z xZS )DPTViTEncoderrN   rn   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
r7   r8   rN   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingrM   rN   r   rS   s      r)   r8   zDPTViTEncoder.__init__  sN    ]]vG_G_A`#aAK$7#ab
&+# $bs   A#r.   r   r   r}   rm   c                    |rdnd }|rdnd }t        | j                        D ]1  \  }}	|r||fz   }|||   nd }
 |	||
|      }|d   }|s)||d   fz   }3 |r||fz   }|st        d |||fD              S t        |||      S )Nr'   r   r   c              3   &   K   | ]	  }||  y wr   r'   ).0vs     r)   	<genexpr>z(DPTViTEncoder.forward.<locals>.<genexpr>)  s     mq_`_lms   )r,   r.   r/   )	enumerater  r&   r   )rM   r.   r   r   r}   rm   all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss               r)   r   zDPTViTEncoder.forward  s     #7BD$5b4(4 	POA|#$58H$H!.7.CilO(IZ[M)!,M &9]1=M<O&O#	P   1]4D Dm]4EGZ$[mmm++*
 	
r(   )NFFT)r   r    r!   r   r8   r#   r   r   r   r   r&   r   r   r   r   s   @r)   r  r    sz    ,y ,T , -1"'%* !
||!
 ELL)!
  	!

 #!
 !
 
uo%	&!
r(   r  c                   t     e Zd ZdZ fdZd Zd Zddeej                     deej                     fdZ
 xZS )	DPTReassembleStagea@  
    This class reassembles the hidden states of the backbone into image-like feature representations at various
    resolutions.

    This happens in 3 stages:
    1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to
       `config.readout_type`.
    2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`.
    3. Resizing the spatial dimensions (height, width).

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                     t         |           || _        t        j                         | _        |j                  r| j                  |       n| j                  |       |j                  | _	        y r   )
r7   r8   rN   r   r  layers	is_hybrid_init_reassemble_dpt_hybrid_init_reassemble_dptneck_ignore_stagesr   s     r)   r8   zDPTReassembleStage.__init__A  sU    mmo,,V4%%f-"(";";r(   c           	      v   t        t        t        |j                              |j                        D ]r  \  }}|dk  r.| j
                  j                  t        j                                9|dkD  s?| j
                  j                  t        ||j                  |   |             t |j                  dk7  rt        d|j                   d      t        j                         | _        t        |      }t        t        |j                              D ]  }|dk  rA| j                  j                  t        j                  t        j                                      I|dkD  sO| j                  j                  t        j                  t        j                   d|z  |      t"        |j$                                   y)a   "
        For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official
        implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438
        for more details.
        r   rB   factorprojectzReadout type z! is not supported for DPT-Hybrid.rW   N)zipr  rC   neck_hidden_sizesreassemble_factorsr  appendr   IdentityDPTReassembleLayerreadout_typerD   r  readout_projects_get_backbone_hidden_size
Sequentialr   r
   r   )rM   rN   r  r!  r<   s        r)   r  z.DPTReassembleStage._init_reassemble_dpt_hybridM  sX    U3v'?'?#@A6C\C\] 	tIAvAv""2;;=1Q""#5fvG_G_`aGbkq#rs		t )+}V-@-@,AAbcdd !#/7s63345 	AAv%%,,R]]2;;=-IJQ%%,,MM"))AO["I6RXRcRcKde		r(   c           	      <   t        t        t        |j                              |j                        D ]9  \  }}| j
                  j                  t        ||j                  |   |             ; |j                  dk(  rt        j                         | _        t        |      }t        t        |j                              D ]Y  }| j                  j                  t        j                  t        j                  d|z  |      t        |j                                   [ y y )Nr   r"  rW   )r#  r  rC   r$  r%  r  r&  r(  r)  r   r  r*  r+  r,  r   r
   r   )rM   rN   r  r!  r<   r   s         r)   r  z'DPTReassembleStage._init_reassemble_dptg  s    U3v'?'?#@A6C\C\] 	pIAvKK1&6C[C[\]C^gmno	p )+$&MMOD!3F;K3v7789 %%,,MM"))AO["I6RXRcRcKde ,r(   r.   rn   c                    g }t        |      D ]  \  }}|| j                  vr|dddf   |ddddf   }}|j                  \  }}	}
|||j                  ||||
      }n"t	        |	dz        }|j                  ||||
      }|j                  dddd      j                         }|j                  }| j                  j                  dk(  r|j                  d      j                  d      }|j                  d      j                  |      } | j                  |   t        j                  ||fd	            }|j                  ddd      j                  |      }nM| j                  j                  d
k(  r4|j                  d      |j                  d	      z   }|j                  |      } | j                  |   |      }|j!                  |        |S )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`):
                List of hidden states from the backbone.
        Nr   r   rV   r	   rW   r"  )r   rW   r   r3   add)r  r  rr   r]   r   r^   r   rN   r)  rt   	unsqueeze	expand_asr*  r#   ra   r  r&  )rM   r.   patch_heightpatch_widthoutr  hidden_staterK   rw   sequence_lengthr;   rY   feature_shapereadouts                 r)   r   zDPTReassembleStage.forwards  s    (7 	%OA|///*6q!t*<l1ab5>Q<	<H<N<N9
O\+0G#/#7#7
LR]_k#lL$_c%9:D#/#7#7
D$P\#]L+33Aq!Q?JJL , 2 2;;++y8#/#7#7#:#B#B9#ML'11!4>>|LG#;4#8#8#;EII|U\F]_a<b#cL#/#7#71a#@#H#H#WL[[--6#/#7#7#:Y=P=PQS=T#TL#/#7#7#FL-t{{1~l;JJ|$3	%6 
r(   NN)r   r    r!   r"   r8   r  r  listr#   r   r   r   r   s   @r)   r  r  1  s@    
<4
#T%,,%7 #aefkfrfras #r(   r  c                 z    | j                   $| j                  du r| j                   j                  S | j                  S r   )backbone_configr  r<   )rN   s    r)   r+  r+    s9    )f.>.>%.G%%111!!!r(   c                   $     e Zd Z fdZd Z xZS )r(  c           	      \   t         |           t        |      }t        j                  ||d      | _        |dkD  r t        j                  ||||d      | _        y |dk(  rt        j                         | _        y |dk  r,t        j                  ||dt        d|z        d      | _        y y )Nr   )in_channelsout_channelsr6   r   r6   r   paddingr	   )
r7   r8   r+  r   rG   rH   ConvTranspose2dresizer'  r   )rM   rN   rB   r!  r<   rS   s        r)   r8   zDPTReassembleLayer.__init__  s    /7))(`ab A:,,XxV\blmnDKq[++-DKaZ))HhAcRSV\R\oghiDK r(   c                 J    | j                  |      }| j                  |      }|S r   )rH   rD  )rM   r5  s     r)   r   zDPTReassembleLayer.forward  s$    |4{{<0r(   r   r    r!   r8   r   r   r   s   @r)   r(  r(    s    jr(   r(  c                   $     e Zd Z fdZd Z xZS )DPTFeatureFusionStagec                     t         |           t        j                         | _        t        t        |j                              D ]&  }| j                  j                  t        |             ( y r   )
r7   r8   r   r  r  r  rC   r$  r&  DPTFeatureFusionLayerr
  s      r)   r8   zDPTFeatureFusionStage.__init__  sR    mmos63345 	>AKK4V<=	>r(   c                     |d d d   }g }d }t        || j                        D ]*  \  }}|	 ||      }n	 |||      }|j                  |       , |S )Nr3   )r#  r  r&  )rM   r.   fused_hidden_statesfused_hidden_stater5  r  s         r)   r   zDPTFeatureFusionStage.forward  sq    %dd+ !#&}dkk#B 	;L%!)%*<%8"%*+=|%L"&&'9:	; #"r(   rF  r   s   @r)   rH  rH    s    >#r(   rH  c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )DPTPreActResidualLayerz
    ResidualConvUnit, pre-activate residual unit.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
    c                 l   t         |           |j                  | _        |j                  |j                  n| j                   }t        j                         | _        t        j                  |j                  |j                  ddd|      | _
        t        j                         | _        t        j                  |j                  |j                  ddd|      | _        | j                  rIt        j                  |j                        | _        t        j                  |j                        | _        y y )Nr	   r   )r6   r   rB  r   )r7   r8   !use_batch_norm_in_fusion_residualuse_batch_normuse_bias_in_fusion_residualr   ReLUactivation1rG   fusion_hidden_sizeconvolution1activation2convolution2BatchNorm2dbatch_norm1batch_norm2)rM   rN   rS  rS   s      r)   r8   zDPTPreActResidualLayer.__init__  s   $FF 11= ..((( 	$ 779II%%%%,
 779II%%%%,
 !~~f.G.GHD!~~f.G.GHD r(   r5  rn   c                    |}| j                  |      }| j                  |      }| j                  r| j                  |      }| j	                  |      }| j                  |      }| j                  r| j                  |      }||z   S r   )rU  rW  rR  r[  rX  rY  r\  rM   r5  residuals      r)   r   zDPTPreActResidualLayer.forward  s    ''5((6++L9L''5((6++L9Lh&&r(   )	r   r    r!   r"   r8   r#   r   r   r   r   s   @r)   rO  rO    s*     ID'ELL 'U\\ 'r(   rO  c                   ,     e Zd ZdZd fd	ZddZ xZS )rJ  a3  Feature fusion layer, merges feature maps from different stages.

    Args:
        config (`[DPTConfig]`):
            Model configuration class defining the model architecture.
        align_corners (`bool`, *optional*, defaults to `True`):
            The align_corner setting for bilinear upsample.
    c                     t         |           || _        t        j                  |j
                  |j
                  dd      | _        t        |      | _        t        |      | _	        y )Nr   T)r6   r   )
r7   r8   align_cornersr   rG   rV  rH   rO  residual_layer1residual_layer2)rM   rN   rb  rS   s      r)   r8   zDPTFeatureFusionLayer.__init__  sT    *))F$=$=v?X?Xfgnrs5f=5f=r(   c                    |l|j                   |j                   k7  r?t        j                  j                  ||j                   d   |j                   d   fdd      }|| j	                  |      z   }| j                  |      }t        j                  j                  |dd| j                        }| j                  |      }|S )NrW   r	   rX   FrY   rZ   rb  scale_factorrZ   rb  )rr   r   r_   r`   rc  rd  rb  rH   r^  s      r)   r   zDPTFeatureFusionLayer.forward  s    !!X^^3==44L$6$6q$9<;M;Ma;P#QXbrw 5  ($*>*>x*HHL++L9}}00qzI[I[ 1 
 |4r(   Tr   r   r   s   @r)   rJ  rJ    s    >r(   rJ  c                   :    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZd Zy)DPTPreTrainedModelrN   dptrk   Tc                    t        |t        j                  t        j                  t        j                  f      rl|j
                  j                  j                  d| j                  j                         |j                  |j                  j                  j                          nst        |t        j                  t        j                  f      rI|j                  j                  j                          |j
                  j                  j                  d       t        |t        t         f      rI|j"                  j                  j                          |j$                  j                  j                          yy)zInitialize the weightsr   )meanstdNg      ?)r=   r   r   rG   rC  weightdatanormal_rN   initializer_ranger   zero_r   rZ  fill_r   r1   rK   rL   )rM   r   s     r)   _init_weightsz DPTPreTrainedModel._init_weights;  s    fryy"))R5G5GHI MM&&CT[[5R5R&S{{&  &&(r~~ >?KK""$MM$$S)f/1GHI!!'')&&++113 Jr(   N)r   r    r!   r   r%   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendrv  r'   r(   r)   rk  rk  0  s5    $O&*#N"&4r(   rk  c                        e Zd Zd fd	Zd Zd Ze	 	 	 	 ddej                  de	ej                     de	e
   de	e
   de	e
   d	eeef   fd
       Z xZS )DPTModelc                 T   t         |   |       || _        |j                  rt	        |      | _        nt        |      | _        t        |      | _        t        j                  |j                  |j                        | _        |rt        |      nd| _        | j!                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r7   r8   rN   r  r1   r~   r   r  encoderr   r   r<   r   	layernormDPTViTPoolerpooler	post_init)rM   rN   add_pooling_layerrS   s      r)   r8   zDPTModel.__init__M  s    
 	  4V<DO.v6DO$V,f&8&8f>S>ST.?l6*T 	r(   c                 r    | j                   j                  r| j                  S | j                  j                  S r   )rN   r  r~   r   )rM   s    r)   get_input_embeddingszDPTModel.get_input_embeddingsb  s)    ;;  ??"??333r(   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r  r   r   )rM   heads_to_pruner  r   s       r)   _prune_headszDPTModel._prune_headsh  sE    
 +002 	CLE5LLu%//;;EB	Cr(   rk   r   r   r}   rm   rn   c                 Z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  || j                   j
                        }| j                  ||      }|s|d   n|j                  }| j                  |||||      }|d   }	| j                  |	      }	| j                  | j                  |	      nd }
|s|
|	|
fn|	f}||dd  z   |dd  z   S t        |	|
|j                  |j                  |j                        S )N)rm   r   r   r   r}   rm   r   )r,   r-   r.   r/   r   )rN   r   r}   use_return_dictget_head_maskr  r~   r   r  r  r  r+   r.   r/   r   )rM   rk   r   r   r}   rm   embedding_outputembedding_last_hidden_statesencoder_outputssequence_outputpooled_outputhead_outputss               r)   r   zDPTModel.forwardp  sU    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] &&y$++2O2OP	??<[?QBM'7':ScSvSv$,,(/!5# ' 
 *!,..98<8OO4UY?L?XO];_n^pL/!""558H8LLLC-')77&11%5%N%N
 	
r(   ri  )NNNN)r   r    r!   r8   r  r  r   r#   r$   r   r   r   r&   r+   r   r   r   s   @r)   r  r  K  s    *4C  26,0/3&*/
''/
 E--./
 $D>	/

 'tn/
 d^/
 
uJJ	K/
 /
r(   r  c                   *     e Zd Zdef fdZd Z xZS )r  rN   c                     t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        y r   )
r7   r8   r   r   r<   pooler_output_sizer   r
   
pooler_act
activationr   s     r)   r8   zDPTViTPooler.__init__  s>    YYv1163L3LM
 !2!23r(   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r  )rM   r.   first_token_tensorr  s       r)   r   zDPTViTPooler.forward  s6     +1a40

#566r(   )r   r    r!   r   r8   r   r   r   s   @r)   r  r    s    4y 4
r(   r  c                   h     e Zd ZdZ fdZddeej                     deej                     fdZ xZ	S )DPTNecka;  
    DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as
    input and produces another list of tensors as output. For DPT, it includes 2 stages:

    * DPTReassembleStage
    * DPTFeatureFusionStage.

    Args:
        config (dict): config dict.
    c           
         t         |           || _        |j                   |j                  j                  dv rd | _        nt        |      | _        t        j                         | _	        |j                  D ]?  }| j                  j                  t        j                  ||j                  ddd             A t        |      | _        y )N)swinv2r	   r   Fr6   rB  r   )r7   r8   rN   r<  
model_typereassemble_stager  r   r  convsr$  r&  rG   rV  rH  fusion_stage)rM   rN   channelrS   s      r)   r8   zDPTNeck.__init__  s     !!-&2H2H2S2SWa2a$(D!$6v$>D!]]_
// 	sGJJbii1J1JXYcdkpqr	s 2&9r(   r.   rn   c                    t        |t        t        f      st        d      t	        |      t	        | j
                  j                        k7  rt        d      | j                  | j                  |||      }t        |      D cg c]  \  }} | j                  |   |       }}}| j                  |      }|S c c}}w )z
        Args:
            hidden_states (`list[torch.FloatTensor]`, each of shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, hidden_size, height, width)`):
                List of hidden states from the backbone.
        z2hidden_states should be a tuple or list of tensorszOThe number of hidden states should be equal to the number of neck hidden sizes.)r=   r&   r:  	TypeErrorrC   rN   r$  rD   r  r  r  r  )rM   r.   r2  r3  r  featurer{   r   s           r)   r   zDPTNeck.forward  s     -%7PQQ}T[[%B%B!CCnoo   , 11-{[M=F}=UVzq'MDJJqM'*VV ""8, Ws   B:r9  
r   r    r!   r"   r8   r:  r#   r   r   r   r   s   @r)   r  r    s6    	:"T%,,%7 aefkfrfras r(   r  c                   `     e Zd ZdZ fdZdeej                     dej                  fdZ xZ	S )DPTDepthEstimationHeada	  
    Output head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples
    the predictions to the input resolution after the first convolutional layer (details can be found in the paper's
    supplementary material).
    c                    t         |           || _        d | _        |j                  rt        j                  ddddd      | _        |j                  }t        j                  t        j                  ||dz  ddd      t        j                  ddd	
      t        j                  |dz  dddd      t        j                         t        j                  ddddd      t        j                               | _        y )N   )r	   r	   )r   r   rA  rW   r	   r   rX   Trg      r   )r7   r8   rN   rH   add_projectionr   rG   rV  r,  UpsamplerT  headrM   rN   r{   rS   s      r)   r8   zDPTDepthEstimationHead.__init__  s       iiSfV]cdDO,,MMIIhA1QPQRKKQZtLIIh!mRQq!LGGIIIb!1a@GGI
	r(   r.   rn   c                     || j                   j                     }| j                  +| j                  |      } t        j                         |      }| j                  |      }|j                  d      }|S )Nr   r[   )rN   head_in_indexrH   r   rT  r  squeeze)rM   r.   predicted_depths      r)   r   zDPTDepthEstimationHead.forward  sg    %dkk&?&?@??& OOM:M%BGGIm4M))M2)11a18r(   r  r   s   @r)   r  r    s-    
&T%,,%7 ELL r(   r  zu
    DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2.
    c                        e Zd Z fdZe	 	 	 	 	 d
dej                  deej                     deej                     dee	   dee	   dee	   de
eej                     ef   fd	       Z xZS )DPTForDepthEstimationc                 $   t         |   |       d | _        |j                  du r)|j                  |j                  t        |      | _        nt        |d      | _        t        |      | _	        t        |      | _        | j                          y NF)r  )r7   r8   rA   r  r<  r   r  rl  r  neckr  r  r  r   s     r)   r8   zDPTForDepthEstimation.__init__  s}     u$&*@*@*LPVP_P_Pk)&1DM%@DH FO	 +62	 	r(   rk   r   labelsr   r}   rm   rn   c                 f    d}|t        d      ||n j                  j                  }||n j                  j                  }||n j                  j                  } j
                  + j
                  j                  |||      }|j                  }	n j                  |||d|      }|r|j                  n|d   }	 j                  j                  s:t        |	dd       D 
cg c]   \  }
}|
 j                  j                  v s|" }	}
}nD|r|j                  nt        |d         }|j                   fdt        |	dd       D               |}	d	\  }} j                  j                   S j                  j                  d
u r;|j"                  \  }}}} j                  j                   j$                  }||z  }||z  } j'                  |	||      }	 j)                  |	      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t+        |||r|j                  nd|j,                        S c c}}
w )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth depth estimation maps for computing the loss.

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForDepthEstimation
        >>> import torch
        >>> import numpy as np
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large")
        >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

        >>> # prepare image for the model
        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # interpolate to original size
        >>> post_processed_output = image_processor.post_process_depth_estimation(
        ...     outputs,
        ...     target_sizes=[(image.height, image.width)],
        ... )

        >>> # visualize the prediction
        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
        >>> depth = predicted_depth * 255 / predicted_depth.max()
        >>> depth = depth.detach().cpu().numpy()
        >>> depth = Image.fromarray(depth.astype("uint8"))
        ```NzTraining is not implemented yet)r}   r   Tr  r   r3   c              3   ^   K   | ]$  \  }}|j                   j                  d d v r| & ywrW   NrN   backbone_out_indicesr  idxr  rM   s      r)   r  z0DPTForDepthEstimation.forward.<locals>.<genexpr>w  s6      .$Wdkk>>qrBB .s   *-r9  FrW   )lossr  r.   r/   )NotImplementedErrorrN   r  r}   r   rA   forward_with_filtered_kwargsrs   rl  r.   r  r  r  r   r:  extendr<  rr   r:   r  r  r   r/   )rM   rk   r   r  r   r}   rm   r  r   r.   r  r  backbone_hidden_statesr2  r3  r   rx   ry   r:   r  r   s   `                    r)   r   zDPTForDepthEstimation.forward)  st   \ %&GHH%0%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq==$mm@@3G[l A G $00Mhh#"3%)'  G 6AG11gajM ;;((09-:K0L! ,WPSW[WbWbWwWwPwG! ! NY)I)I^bcjkmcn^o&&-- .(1-2C(D.  !7$.!k;;&&2t{{7L7LPU7U"."4"4Aq&%44??J!Z/L:-K		-{K))M2#)+gabk9)+gabk9)-)9TGf$EvE#+3G'//T))	
 	
?!s   & H-H-)NNNNN)r   r    r!   r8   r   r#   r$   r   
LongTensorr   r   r&   r   r   r   r   r   s   @r)   r  r    s    $  26-1,0/3&*l
''l
 E--.l
 ))*	l

 $D>l
 'tnl
 d^l
 
uU\\"$88	9l
 l
r(   r  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )DPTSemanticSegmentationHeadc                    t         |           || _        |j                  }t	        j
                  t	        j                  ||ddd      t	        j                  |      t	        j                         t	        j                  |j                        t	        j                  ||j                  d      t	        j                  ddd	            | _        y )
Nr	   r   Fr  r5   rW   rX   Trg  )r7   r8   rN   rV  r   r,  rG   rZ  rT  r   semantic_classifier_dropout
num_labelsr  r  r  s      r)   r8   z$DPTSemanticSegmentationHead.__init__  s    ,,MMIIhaONN8$GGIJJv99:IIh 1 1qAKKQZtL
	r(   r.   rn   c                 Z    || j                   j                     }| j                  |      }|S r   )rN   r  r  rM   r.   logitss      r)   r   z#DPTSemanticSegmentationHead.forward  s)    %dkk&?&?@=)r(   )	r   r    r!   r8   r:  r#   r   r   r   r   s   @r)   r  r    s(    
T%,,%7 ELL r(   r  c                   $     e Zd Z fdZd Z xZS )DPTAuxiliaryHeadc                 X   t         |           |j                  }t        j                  t        j
                  ||ddd      t        j                  |      t        j                         t        j                  dd      t        j
                  ||j                  d            | _
        y )Nr	   r   Fr  g?r5   )r7   r8   rV  r   r,  rG   rZ  rT  r   r  r  r  s      r)   r8   zDPTAuxiliaryHead.__init__  sv    ,,MMIIhaONN8$GGIJJsE"IIh 1 1qA
	r(   c                 (    | j                  |      }|S r   )r  r  s      r)   r   zDPTAuxiliaryHead.forward  s    =)r(   rF  r   s   @r)   r  r    s    

r(   r  c                        e Zd Z fdZe	 	 	 	 	 	 d
deej                     deej                     deej                     dee	   dee	   dee	   de
eej                     ef   fd	       Z xZS )DPTForSemanticSegmentationc                     t         |   |       t        |d      | _        t	        |      | _        t        |      | _        |j                  rt        |      nd | _
        | j                          y r  )r7   r8   r  rl  r  r  r  r  use_auxiliary_headr  auxiliary_headr  r   s     r)   r8   z#DPTForSemanticSegmentation.__init__  s^     Fe< FO	 07	:@:S:S.v6Y] 	r(   rk   r   r  r   r}   rm   rn   c                     ||n j                   j                  }||n j                   j                  }|$ j                   j                  dk(  rt	        d       j                  |||d|      }|r|j                  n|d   } j                   j                  s:t        |dd       D 	
cg c]   \  }	}
|	 j                   j                  v s|
" }}	}
nD|r|j                  nt        |d         }|j                   fdt        |dd       D               |} j                  |      } j                  |      }d} j                   j                  |d         }d}|t         j"                  j%                  ||j&                  d	d d
d      }|0t         j"                  j%                  ||j&                  d	d d
d      }t)         j                   j*                        } |||      } ||      }| j                   j,                  |z  z   }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t/        |||r|j                  nd|j0                        S c c}
}	w )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DPTForSemanticSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("Intel/dpt-large-ade")
        >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  r3   c              3   `   K   | ]%  \  }}|j                   j                  d d v s"| ' ywr  r  r  s      r)   r  z5DPTForSemanticSegmentation.forward.<locals>.<genexpr>  s7      *(CCSWS^S^SsSstutvSwLw*s   #..)r.   r4   rX   Frf  )ignore_indexrW   )r  r  r.   r/   )rN   r  r}   r  rD   rl  r.   r  r  r  r   r:  r  r  r  r  r   r_   r`   rr   r   semantic_loss_ignore_indexauxiliary_loss_weightr   r/   )rM   rk   r   r  r   r}   rm   r   r.   r  r  r  r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossauxiliary_lossr   s   `                    r)   r   z"DPTForSemanticSegmentation.forward  s   @ &1%<k$++B]B]$8$D $++JjJj 	 $++"8"8A"=NOO((/!%#  
 2=--'!* {{$$,5mAB6G,H(CCSWS^S^SsSsLsM  JUW%E%EZ^_fgi_jZk"")) *,5mAB6G,H*  3M			>=)*#22=3DE!}}88V\\"#.Zu  9    +-/]]-F-F$6<<+<:]b .G .* (T[[5[5[\H !16:I%&@&INt{{@@>QQD# WQR[0 WQR[0)-)9TGf$EvE&3G'//T))	
 	
Us   , II)NNNNNN)r   r    r!   r8   r   r   r#   r$   r  r   r   r&   r   r   r   r   r   s   @r)   r  r    s      5915-1,0/3&*c
u001c
 E--.c
 ))*	c

 $D>c
 'tnc
 d^c
 
uU\\"$;;	<c
 c
r(   r  )r  r  r  rk  )r   )Hr"   collections.abcr>   dataclassesr   typingr   r   r   r#   torch.utils.checkpointr   torch.nnr   activationsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   r   utils.backbone_utilsr   configuration_dptr   
get_loggerr   r   r   r+   Moduler1   r   r   r   floatr   r   r   r   r   r   r   r  r  r+  r(  rH  rO  rJ  rk  r  r  r  r  r  r  r  r  __all__r'   r(   r)   <module>r     s    ! , ,    % ! 9 ^ ^ F Q D D 1 ( 
		H	% 	M 	M 	M M; M M$`
RYY `
F7Yryy 7YtBII N %II%<<% 
% <<	%
 U\\*% % %>Fryy FTryy $&bii &T "299  ', 'V(
BII (
Ve eP" ,#BII #0:'RYY :'z"BII "J 4 4 44 T
! T
 T
p299 2bii 2j&RYY &R 
@
. @

@
F")) 2ryy & t
!3 t
 t
n dr(   