
"""PyTorch SegGpt model."""

import collections.abc
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import functional as F

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging, torch_int
from .configuration_seggpt import SegGptConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`SegGptEncoderOutput`].
    """
)
class SegGptEncoderOutput(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple[torch.FloatTensor]`, *optional*, returned when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
        of shape `(batch_size, patch_height, patch_width, hidden_size)`.
    attentions (`tuple[torch.FloatTensor]`, *optional*, returned when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape
        `(batch_size, num_heads, seq_len, seq_len)`.
    intermediate_hidden_states (`tuple[torch.FloatTensor]`, *optional*, returned when `config.intermediate_hidden_state_indices` is set):
        Tuple of `torch.FloatTensor` of shape `(batch_size, patch_height, patch_width, hidden_size)`.
        Each element in the tuple corresponds to the output of the layer specified in `config.intermediate_hidden_state_indices`.
        Additionally, each feature passes through a LayerNorm.
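        In [`SegGptForImageSegmentation`] these per-layer features are concatenated along the last axis and fed to
        the decoder, in the order in which the layers emit them.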
    """

    last_hidden_state: torch.FloatTensor
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    intermediate_hidden_states: Optional[tuple[torch.FloatTensor]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`SegGptImageSegmentationOutput`].
    """
)
class SegGptImageSegmentationOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
        The loss value.
    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        The predicted masks.
    hidden_states (`tuple[torch.FloatTensor]`, *optional*, returned when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
        of shape `(batch_size, patch_height, patch_width, hidden_size)`.
    attentions (`tuple[torch.FloatTensor]`, *optional*, returned when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape
        `(batch_size, num_heads, seq_len, seq_len)`.
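
    Note (summary, not extra behavior): `pred_masks` lives in the model's stacked coordinate space (prompt half on
    top of the input half); use [`SegGptImageProcessor.post_process_semantic_segmentation`] to recover masks at the
    original image size.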
    Nloss
pred_masksr   r   )r   r   r   r   r&   r   r   r   r   r'   r   r    r   r!   r"   r#   r%   r%   @   sg     )-D(5$$
%,.2J**+28<M8E%"3"345<59Ju00129r"   r%   c                   (     e Zd ZdZ fdZd Z xZS )SegGptPatchEmbeddingsz


class SegGptPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
            )
        embeddings = self.projection(pixel_values).permute(0, 2, 3, 1)
        return embeddings
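

# Shape walk-through for the projection above (illustrative only; the concrete
# numbers assume the published "BAAI/seggpt-vit-large" configuration with its
# 896x448 stacked input and patch_size=16, which this module does not enforce):
#   pixel_values (B, 3, 896, 448) --Conv2d(k=16, s=16)--> (B, hidden_size, 56, 28)
#   --permute(0, 2, 3, 1)--> (B, 56, 28, hidden_size)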
ej                  de
ej                     de
e   dej                  f
dZ xZS )SegGptEmbeddingszX
    Construct the embeddings from patch, position embeddings for input and prompt.
    r;   returnNc                 ~   t         |           t        j                  t	        j
                  ddd|j                              | _        t        j                  t	        j
                  ddd|j                              | _        t        j                  t	        j
                  ddd|j                              | _	        t        j                  t	        j
                  ddd|j                              | _
        t        j                  t	        j
                  ddd|j                              | _        t        |      | _        |j                  |j                  z  dz  dz   }t        j                  t	        j                   d||j                              | _        t        j$                  |j&                        | _        y )Nr   r@   )r-   r.   r   	Parameterr   zerosr2   
mask_tokensegment_token_inputsegment_token_prompttype_token_semantictype_token_instancer)   patch_embeddingspretrain_image_sizer0   randnposition_embeddingsDropouthidden_dropout_probdropout)r:   r;   num_positionsr<   s      r#   r.   zSegGptEmbeddings.__init__   s3   ,,u{{1aF<N<N'OP#%<<Aq!VEWEW0X#Y $&LLQ1fFXFX1Y$Z!#%<<Aq!VEWEW0X#Y #%<<Aq!VEWEW0X#Y  5f =33v7H7HHQNQRR#%<<A}fN`N`0a#b zz&"<"<=r"   rF   rG   c                    | j                   d d dd f   }|j                  d   }t        |dz        }t        j                  j                         s
||k7  s||k7  rSt        j                  |j                  d||d      j                  dddd      ||fdd	      }|j                  dddd      S |j                  d||d      S )
Nr         ?r   r   r@   bicubicF)sizemodealign_corners)
rZ   rA   r   r   jit
is_tracingFinterpolatereshaperC   )r:   rF   rG   patch_pos_embedr7   pretrain_patch_sizes         r#   interpolate_pos_encodingz)SegGptEmbeddings.interpolate_pos_encoding   s    221ab59%++A.'S(89 99!%8F%BFY]bFbmm''+>@SUWX``abdeghjkle_#	O #**1aA66"**1feR@@r"   rD   prompt_pixel_valuesbool_masked_posembedding_typec                 R   | j                  |      }| j                  |      }|j                  \  }}}	}
| j                  j                  |||	d      }|j	                  d      j                  |      j                  d||	d      }|d|z
  z  ||z  z   }||nd}| j                  ||	      }|| j                  z   }|| j                  z   }||z   }||z   }|dk(  r| j                  }n |dk(  r| j                  }nt        d|       ||z   }||z   }t        j                  ||fd      }|S )Nra   r   instancesemanticzBEmbedding type should be either 'semantic' or 'instance', but got r   dim)rW   rA   rR   expand	unsqueezetype_asrj   rm   rS   rT   rU   rV   rB   r   cat)r:   rD   rn   ro   rp   input_embeddingsprompt_embeddingsrE   patch_heightpatch_width_rR   w	pos_embedtype_embeddingrH   s                   r#   rI   zSegGptEmbeddings.forward   sh     00> 112EF3C3I3I0
L+q__++JkSUV
%%b)11*=EEb,Xcefg-Q7*q.H+9+E: 11,L	 ,d.F.FF-0I0II ,i7-	9 Z'!55Nz)!55Nabpaqrss+n<->YY 02CD!L
r"   )NN)r   r   r   r   r   r.   intr   Tensorrm   r   
BoolTensorstrrI   rJ   rK   s   @r#   rM   rM   ~   s    >| > > As A3 A5<< A, 7;(,+ll+ #\\+ "%"2"23	+
 !+ 
+r"   rM   c                   8    e Zd ZdZ fdZdededej                  dej                  fdZdej                  d	ej                  d
ej                  dej                  de	eef   de	eef   dej                  fdZ
ddej                  dej                  fdZ xZS )SegGptAttentionz=Multi-head Attention block with relative position embeddings.c                    t         |           |j                  |j                  }}t	        |t
        j                  j                        r|n||f}t	        |t
        j                  j                        r|n||f}|d   |j                  z  |d   |j                  z  f}|j                  |j                  z  }|j                  | _	        |dz  | _
        t        j                  |j                  |j                  dz  |j                        | _        t        j                  |j                  |j                        | _        |j                   | _        | j                   r||t#        d      t        j$                  t'        j(                  d|d   z  dz
  |            | _        t        j$                  t'        j(                  d|d   z  dz
  |            | _        y y )Nr   r   g      r   biaszBInput size must be provided if using relative positional encoding.r@   )r-   r.   r/   r0   r3   r4   r5   r6   r2   num_attention_headsscaler   Linearqkv_biasqkvproj use_relative_position_embeddingsrB   rP   r   rQ   	rel_pos_h	rel_pos_w)r:   r;   r/   r0   
input_sizehead_dimr<   s         r#   r.   zSegGptAttention.__init__   s   !'!2!2F4E4EJ
#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
 mv'8'88*Q-6K\K\:\]
%%)C)CC#)#=#= t^
99V//1C1Ca1Gfoo^IIf00&2D2DE	060W0W-00! !eff  \\%++a*Q-6G!6KX*VWDN\\%++a*Q-6G!6KX*VWDN 1r"   q_sizek_sizerel_posrN   c                    t        dt        ||      z  dz
        }t        j                  |j	                  d|j
                  d   d      j                  ddd      |d      }|j	                  d|      j                  dd      }t        j                  |      dddf   t        ||z  d      z  }t        j                  |      dddf   t        ||z  d      z  }||z
  |dz
  t        ||z  d      z  z   }||j                            S )	a  

    def get_rel_pos(self, q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
        """
        Get relative positional embeddings according to the relative positions of
            query and key sizes.

        Args:
            q_size (int):
                size of the query.
            k_size (int):
                size of key k.
            rel_pos (`torch.Tensor`):
                relative position embeddings (L, channel).

        Returns:
            Extracted positional embeddings according to relative positions.
        """
        max_rel_dist = int(2 * max(q_size, k_size) - 1)
        # Interpolate rel pos to the required number of relative offsets.
        rel_pos_resized = F.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)

        # Scale the coords with the short length if shapes for q and k are different.
        q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
        k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
        relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

        return rel_pos_resized[relative_coords.long()]

    def add_decomposed_rel_pos(
        self,
        attn: torch.Tensor,
        query: torch.Tensor,
        rel_pos_h: torch.Tensor,
        rel_pos_w: torch.Tensor,
        q_size: tuple[int, int],
        k_size: tuple[int, int],
    ) -> torch.Tensor:
        """
        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
        https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py

        Args:
            attn (`torch.Tensor`):
                attention map.
            query (`torch.Tensor`):
                query q in the attention layer with shape (batch_size, query_height * query_width, channel).
            rel_pos_h (`torch.Tensor`):
                relative position embeddings (Lh, channel) for height axis.
            rel_pos_w (`torch.Tensor`):
                relative position embeddings (Lw, channel) for width axis.
            q_size (tuple):
                spatial sequence size of query q with (query_height, query_width).
            k_size (tuple):
                spatial sequence size of key k with (key_height, key_width).

        Returns:
            attn (`torch.Tensor`):
                attention map with added relative positional embeddings.
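
        A tiny shape example (illustrative): with a 2x2 query grid and a 2x2 key grid,
        `attn` arrives as (batch_size, 4, 4), is viewed as (batch_size, 2, 2, 2, 2),
        the height and width offset terms are broadcast-added over the key axes, and
        the result is flattened back to (batch_size, 4, 4).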
        zbhwc,hkc->bhwkzbhwc,wkc->bhwkN)r   rA   rj   r   einsum)r:   r   r   r   r   r   r   query_heightquery_width
key_height	key_widthrelative_position_heightrelative_position_widthrE   r~   ru   reshaped_queryrel_hrel_ws                      r#   add_decomposed_rel_posz&SegGptAttention.add_decomposed_rel_pos  s    > %+!k &
I#'#3#3L*i#X "&"2"2;	9"U"[[
Asz<cR-~?WX-~?VW||Jk:yYeAq!Q,--aAtQ6F0GG||J{(BJQZDZ[r"   r   c           	         |j                   \  }}}}| j                  |      j                  |||z  d| j                  d      j	                  ddddd      }|j                  d|| j                  z  ||z  d      j                  d      \  }}	}
|| j                  z  |	j                  dd      z  }| j                  r.| j                  ||| j                  | j                  ||f||f      }t        j                  j                  j                  |t        j                   d      j#                  |j$                        }|rE|j'                  || j                  ||z  d      }|j'                  || j                  z  ||z  d      }nd }||
z  j                  || j                  ||d      }|j	                  ddddd      j                  |||d      }| j)                  |      }||fS )	Nr   ra   r@   r   r      )dtyperu   )rA   r   rj   r   rC   unbindr   	transposer   r   r   r   r   r   r   softmaxfloat32tor   viewr   )r:   r   output_attentionsrE   rF   rG   r~   r   r   keyvalueattn_weightsattn_weights_reshapedattn_outputs                 r#   rI   zSegGptAttention.forward;  s   '4':':$
FE1 HH]#WZ%D4L4LbQWQ1a# 	  KK:8P8P+PRX[`R`bdellmnosE

*cmmB.CC0066eT^^T^^fe_W]_dVeL xx**22<u}}Z\2]``afalalm
 %1$5$5j$BZBZ\bej\jln$o!055j4C[C[6[]cfk]kmopL$(!#e+44ZAYAY[achjlm!))!Q1a8@@VUZ\^_ii,233r"   )F)r   r   r   r   r.   r   r   r   r   r    r   rI   rJ   rK   s   @r#   r   r      s    GX07# 7s 7U\\ 7ell 7@+ll+ ||+ <<	+
 <<+ c3h+ c3h+ 
+Z#4U\\ #4u|| #4r"   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )	SegGptMlpc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        |j                     | _
        y N)r-   r.   r   r   r2   mlp_dimlin1lin2r	   
hidden_actactr:   r;   r<   s     r#   r.   zSegGptMlp.__init__c  sX    IIf00&..A	IIfnnf.@.@A	&++,r"   r   rN   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r:   r   s     r#   rI   zSegGptMlp.forwardi  s2    		-0/		-0r"   )r   r   r   r.   r   r   rI   rJ   rK   s   @r#   r   r   b  s#    -U\\ ell r"   r   input	drop_probtrainingrN   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
            r   r   )r   r   device)rA   ndimr   randr   r   floor_div)r   r   r   	keep_probrA   random_tensoroutputs          r#   	drop_pathr   q  s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr"   c                   x     e Zd ZdZd	dee   ddf fdZdej                  dej                  fdZ	de
fdZ xZS )
SegGptDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rN   c                 0    t         |           || _        y r   )r-   r.   r   )r:   r   r<   s     r#   r.   zSegGptDropPath.__init__  s    "r"   r   c                 D    t        || j                  | j                        S r   )r   r   r   r   s     r#   rI   zSegGptDropPath.forward  s    FFr"   c                      d| j                    S )Nzp=)r   r:   s    r#   
extra_reprzSegGptDropPath.extra_repr  s    DNN#$$r"   r   )r   r   r   r   r   floatr.   r   r   rI   r   r   rJ   rK   s   @r#   r   r     sG    b#(5/ #T #GU\\ Gell G%C %r"   r   c                        e Zd Zdededdf fdZ	 	 ddej                  dede	d	e	de
eej                  ej                  f   eej                     f   f
d
Z xZS )SegGptLayerr;   drop_path_raterN   Nc                 t   t         |           t        |      | _        t	        |      | _        |dkD  rt        |      nt        j                         | _	        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y )Nr   eps)r-   r.   r   	attentionr   mlpr   r   Identityr   	LayerNormr2   layer_norm_epslayernorm_beforelayernorm_after)r:   r;   r   r<   s      r#   r.   zSegGptLayer.__init__  s    (0V$;IC;O7UWU`U`Ub "V-?-?VEZEZ [!||F,>,>FDYDYZr"   r   ensemble_condfeature_ensembler   c                    | j                  | j                  |      |      }|d   }|dd  }|r|j                  d   dz  |k\  r|j                  |j                  d   dz  d      \  }}	|dk(  ra|j                  d   dz  }
|	j	                  d|
d      }	|	j                  dd      j                  |	      }	 |	j                  |j                   }	n"|	j                  dd      j                  |	      }	t        j                  ||	gd      }| j                  |      |z   }|}| j                  |      }| j                  |      }|| j                  |      z   }|f|z   }|S )	N)r   r   r   r@   rt   ra   T)ru   keepdim)r   r   rA   splitrj   mean	expand_asr   ry   r   r   r   )r:   r   r   r   r   self_attention_outputsattention_outputoutputspromptinputsnum_promptsresiduals               r#   rI   zSegGptLayer.forward  sz    "&!!-0/ "0 "
 2!4(, 0 6 6q 9Q >- O-334D4J4J14MQR4RXY3ZNFF!.44Q71<;;D9CCFK'6D9CCFK$yy&&)9qA '78=H ,,];/ 4>>-#@@ "W,r"   )FF)r   r   r   r   r   r.   r   r   r   boolr   r    rI   rJ   rK   s   @r#   r   r     s    [| [U [t [ "'"'#||# # 	#
  # 
uU\\5<</0%2EE	F#r"   r   c                   p     e Zd Zdeddf fdZ	 	 	 	 ddej                  dededed	edee	e
f   fd
Z xZS )SegGptEncoderr;   rN   Nc           
         t         |           || _        t        j                  d|j
                  |j                  d      D cg c]  }|j                          }}t        j                  t        |j                        D cg c]  }t        |||          c}      | _        t        j                  |j                  |j                        | _        d| _        y c c}w c c}w )Nr   cpu)r   r   F)r-   r.   r;   r   linspacer   num_hidden_layersitemr   
ModuleListranger   layersr   r2   r   	layernormgradient_checkpointing)r:   r;   xdprir<   s        r#   r.   zSegGptEncoder.__init__  s    !&63H3H&JbJbkp!qrAqvvxrrmm%PVPhPhJi$jQ[Q%@$jkf&8&8f>S>ST&+# s$js   CC$r   r   r   output_hidden_statesreturn_dictc                 6   |rdnd }|rdnd }g }t        | j                        D ]  \  }	}
|r||fz   }| j                  j                  |	kD  rdnd} |
||||      }|d   }|	| j                  j                  k(  r.|d |j                  d   dz   ||j                  d   dz  d  z   dz  }|	| j                  j
                  v r |j                  | j                  |             |s||d   fz   } |r||fz   }|st        d ||||fD              S t        ||||      S )Nr!   r@   r   r   r`   c              3   $   K   | ]  }|| 
 y wr   r!   ).0vs     r#   	<genexpr>z(SegGptEncoder.forward.<locals>.<genexpr>  s      = s   )r   r   r   r   )
	enumerater  r;   merge_indexrA   !intermediate_hidden_state_indicesappendr  r    r   )r:   r   r   r   r  r  all_hidden_statesall_self_attentionsr   r  layer_moduler   layer_outputss                r#   rI   zSegGptEncoder.forward  sr    #7BD$5b4%'"(5 	POA|#$58H$H! "&!8!81!<A!M(GWYjkM)!,MDKK+++!"?M$7$7$:a$?@=Q^QdQdefQgklQlQnCoo! DKKAAA*11$..2OP &9]1=M<O&O#)	P,   1]4D D '):<OQkl  
 #++*'A	
 	
r"   )FFFT)r   r   r   r   r.   r   r   r  r   r    r   rI   rJ   rK   s   @r#   r  r    sr    ,| , , "'"'%* 0
||0
 0
  	0

 #0
 0
 
u))	*0
r"   r  c                   \     e Zd ZdZd fd	Zdej                  dej                  fdZ xZS )SegGptLayerNormaA  LayerNorm that supports two data formats: channels_last (default) or channels_first.
    The ordering of the dimensions in the inputs. channels_last corresponds to inputs with shape (batch_size, height,
    width, channels) while channels_first corresponds to inputs with shape (batch_size, channels, height, width).
    c                 N   t         |           t        j                  t	        j
                  |            | _        t        j                  t	        j                  |            | _        || _	        || _
        | j                  dvrt        d| j                         |f| _        y )N)channels_lastchannels_firstzUnsupported data format: )r-   r.   r   rP   r   onesweightrQ   r   r   data_formatNotImplementedErrornormalized_shape)r:   r)  r   r'  r<   s       r#   r.   zSegGptLayerNorm.__init__  s    ll5::.>#?@LL-=!>?	&#FF%(A$BRBRAS&TUU!1 3r"   r  rN   c                 d   | j                   dk(  rWt        j                  j                  j	                  || j
                  | j                  | j                  | j                        }|S | j                   dk(  r|j                  }|j                         }|j                  dd      }||z
  j                  d      j                  dd      }||z
  t        j                  || j                  z         z  }|j                  |      }| j                  d d d d f   |z  | j                  d d d d f   z   }|S )Nr#  r$  r   T)r   r@   )r   )r'  r   r   r   
layer_normr)  r&  r   r   r   r   r   powsqrtr   )r:   r  input_dtypeuss        r#   rI   zSegGptLayerNorm.forward  s
   .##..q$2G2GVZV_V_aeaiaijA  !11''K	Aq$'AQA##At#4AQ%**Q\22A;'AAtTM*Q.1dD=1IIAr"   )gư>r#  )	r   r   r   r   r.   r   r   rI   rJ   rK   s   @r#   r!  r!     s(    
4 %,, r"   r!  c                   >     e Zd Z fdZdej
                  fdZ xZS )SegGptDecoderHeadc                 T   t         |           t        j                  |j                  |j                  dd      | _        t        |j                  |j                  d      | _        t        |j                     | _        t        j                  |j                  ddd      | _        y )Nr   r   )r+   paddingr$  )r)  r   r'  T)r+   r   )r-   r.   r   r8   decoder_hidden_sizeconvr!  r   r  r	   r   act_fctheadr   s     r#   r.   zSegGptDecoderHead.__init__  s    II&&&&	
	 )#77V=R=R`p
 f//0IIf88!QUV	r"   r   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r6  r  r7  r8  r   s     r#   rI   zSegGptDecoderHead.forward-  s@    		-0}5]3		-0r"   )r   r   r   r.   r   r   rI   rJ   rK   s   @r#   r2  r2    s    WU%6%6 r"   r2  c                   v     e Zd Z fdZdej
                  dej
                  fdZdej
                  fdZ xZS )SegGptDecoderc                 B   t         |           t        j                  |j                  t        |j                        z  |j                  dz  |j                  z  d      | _	        t        |      | _        |j                  | _        |j                  | _        || _        y )Nr@   Tr   )r-   r.   r   r   r2   lenr  r0   r5  decoder_embedr2  decoder_predr;   r   s     r#   r.   zSegGptDecoder.__init__7  s    YYV%M%M!NNq 6#=#==

 .f5 ++#)#=#= r"   r   rN   c                    |j                   \  }}}}|j                  |||| j                  | j                  | j                        }|j	                  dddddd      }|j                  |d|| j                  z  || j                  z  f      }|S )	Nr      r   r   r@   r   ra   rA   )rA   rj   r0   r5  rC   )r:   r   rE   r|   r}   r~   s         r#   _reshape_hidden_statesz$SegGptDecoder._reshape_hidden_statesC  s    3@3F3F0
L+q%--k4??DOOUYUmUm
 &--aAq!Q?%--r<$//#A;QUQ`Q`C`a . 
 r"   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r>  rC  r?  r   s     r#   rI   zSegGptDecoder.forwardO  s8    **=933MB))-8r"   )	r   r   r   r.   r   r   rC  rI   rJ   rK   s   @r#   r;  r;  6  s9    

E4E4E 
%J[J[ 
U%6%6 r"   r;  c                   P    e Zd ZU eed<   dZdZdZddgZde	j                  dd	fd
Zy	)SegGptPreTrainedModelr;   modelrD   TrM   r   modulerN   Nc                 >   | j                   j                  }t        |t        j                  t        j
                  f      rt        j                  j                  |j                  j                  j                  t        j                        d|      j                  |j                  j                        |j                  _	        |j                  %|j                  j                  j                          yyt        |t        j                   t"        f      rJ|j                  j                  j                          |j                  j                  j%                  d       yt        |t&              rt        j                  j                  |j(                  j                  j                  t        j                        d|      j                  |j(                  j                        |j(                  _	        t        j                  j                  |j*                  j                  j                  t        j                        d|      j                  |j*                  j                        |j*                  _	        yt        |t,              rt        j                  j                  |j.                  j                  j                  t        j                        d|      j                  |j.                  j                        |j.                  _	        t        j                  j                  j1                  |j2                  |       t        j                  j                  j1                  |j4                  |       t        j                  j                  j1                  |j6                  |       t        j                  j                  j1                  |j8                  |       t        j                  j                  j1                  |j:                  |       yy)zInitialize the weightsr   )r   stdNr   )rJ  )r;   initializer_ranger3   r   r   r8   inittrunc_normal_r&  datar   r   r   r   r   zero_r   r!  fill_r   r   r   rM   rZ   normal_rR   rS   rT   rU   rV   )r:   rH  rJ  s      r#   _init_weightsz#SegGptPreTrainedModel._init_weights_  s   kk++fryy"))45 "$!6!6v}}7I7I7L7LU]]7[bekn!6!o!r!r##"FMM {{&  &&( ' ?@KK""$MM$$S)0$&GG$9$9  %%((7 %: % b!!''(	 ! %'GG$9$9  %%((7 %: % b!!''(	 !  01.0gg.C.C**//225==A /D / b++112	 &&+ HHMM!!&"3"3!=HHMM!!&"<"<#!FHHMM!!&"="=3!GHHMM!!&"<"<#!FHHMM!!&"<"<#!F 2r"   )r   r   r   r   r   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   ModulerR  r!   r"   r#   rF  rF  W  s>    $O&*#+];&GBII &G$ &Gr"   rF  c                   6    e Zd Zdef fdZdefdZdeee	e   f   ddfdZ
e	 	 	 	 	 	 	 ddej                  d	ej                  d
ej                  deej                     dee   dee   deej$                     dee   dee   dee   deeef   fd       Z xZS )SegGptModelr;   c                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r   )r-   r.   r;   rM   rH   r  encoder	post_initr   s     r#   r.   zSegGptModel.__init__  s;     *62$V, 	r"   rN   c                 .    | j                   j                  S r   )rH   rW   r   s    r#   get_input_embeddingsz SegGptModel.get_input_embeddings  s    ///r"   heads_to_pruneNc                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        prompt_pixel_values: torch.Tensor,
        prompt_masks: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        feature_ensemble: Optional[bool] = None,
        embedding_type: Optional[str] = None,
        labels: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SegGptEncoderOutput]:
        r"""
        prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
            [`SegGptImageProcessor.__call__`] for details.
        prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
            details.
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        feature_ensemble (`bool`, *optional*):
            Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
            if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
            be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
        embedding_type (`str`, *optional*):
            Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
            instance or semantic.
        labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
            Ground truth mask for input images.

        Examples:

        ```python
        >>> from transformers import SegGptImageProcessor, SegGptModel
        >>> from PIL import Image
        >>> import requests

        >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
        >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
        >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

        >>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
        >>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
        >>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")

        >>> checkpoint = "BAAI/seggpt-vit-large"
        >>> model = SegGptModel.from_pretrained(checkpoint)
        >>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

        >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> list(outputs.last_hidden_state.shape)
        [1, 56, 28, 1024]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        feature_ensemble = feature_ensemble if feature_ensemble is not None else False

        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
        pixel_values = pixel_values.to(expected_dtype)
        prompt_pixel_values = prompt_pixel_values.to(expected_dtype)

        # Prepare inputs: the prompt image is stacked on top of the input image, and the prompt mask on top of
        # either itself (inference) or the labels (training).
        pixel_values = torch.cat((prompt_pixel_values, pixel_values), dim=2)
        prompt_pixel_values = (
            torch.cat((prompt_masks, prompt_masks), dim=2)
            if labels is None
            else torch.cat((prompt_masks, labels), dim=2)
        )

        if bool_masked_pos is None and labels is not None:
            logger.warning_once(
                "Labels were provided, but bool_masked_pos were not. It will be set to default value. If you're training the model, make sure to provide a bool_masked_pos."
            )

        # We concatenate on the height axis so SegGPT can handle the pair as a single image; the half of the mask
        # prompt destined for the prediction is masked out since it carries no information.
        if bool_masked_pos is None:
            num_patches = self.embeddings.patch_embeddings.num_patches
            bool_masked_pos_zeros = torch.zeros(num_patches // 2, dtype=torch.bool, device=pixel_values.device)
            bool_masked_pos_ones = torch.ones(
                num_patches - num_patches // 2, dtype=torch.bool, device=pixel_values.device
            )
            bool_masked_pos = torch.cat([bool_masked_pos_zeros, bool_masked_pos_ones])
            bool_masked_pos = bool_masked_pos.unsqueeze(0)

        embedding_output = self.embeddings(
            pixel_values, prompt_pixel_values, embedding_type=embedding_type, bool_masked_pos=bool_masked_pos
        )

        encoder_outputs = self.encoder(
            embedding_output,
            feature_ensemble=feature_ensemble,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs


def patchify(tensor: torch.Tensor, patch_size: int) -> torch.Tensor:
    batch_size, num_channels, height, width = tensor.shape
    patch_height = height // patch_size
    patch_width = width // patch_size

    tensor = tensor.reshape(shape=(batch_size, num_channels, patch_height, patch_size, patch_width, patch_size))
    tensor = tensor.permute(0, 2, 4, 3, 5, 1)
    tensor = tensor.reshape(shape=(batch_size, patch_height * patch_width, patch_size**2 * num_channels))

    return tensor


def unpatchify(tensor: torch.Tensor, patch_height: int, patch_width: int) -> torch.Tensor:
    batch_size = tensor.shape[0]
    patch_size = int((tensor.shape[-1] / 3) ** 0.5)
    if patch_height * patch_width != tensor.shape[1]:
        raise ValueError(
            f"Number of patches {tensor.shape[1]} does not match patch height ({patch_height}) and width ({patch_width})"
        )

    tensor = tensor.reshape(shape=(batch_size, patch_height, patch_width, patch_size, patch_size, 3))
    tensor = tensor.permute(0, 5, 1, 3, 2, 4)
    tensor = tensor.reshape(shape=(batch_size, 3, patch_height * patch_size, patch_width * patch_size))

    return tensor


class SegGptLoss(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.beta = config.beta
        self.patch_size = config.patch_size

    def forward(
        self,
        prompt_masks: torch.FloatTensor,
        pred_masks: torch.FloatTensor,
        labels: torch.FloatTensor,
        bool_masked_pos: torch.BoolTensor,
    ):
        """Computes the smooth L1 loss between the predicted masks and the ground truth masks.

        Args:
            prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values from mask prompt.

            pred_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, 2*height, width)`):
                Predicted masks.

            labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Ground truth mask for input images.

            bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
                Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:
            `torch.FloatTensor`: The mean smooth L1 loss over the masked patches.
        """
        ground_truth = torch.cat((prompt_masks, labels), dim=2)

        mask = bool_masked_pos[:, :, None].repeat(1, 1, self.patch_size**2 * 3)
        mask = unpatchify(mask, ground_truth.shape[2] // self.patch_size, ground_truth.shape[3] // self.patch_size)

        loss = F.smooth_l1_loss(pred_masks, ground_truth, reduction="none", beta=self.beta)
        loss = (loss * mask).sum() / mask.sum()  # mean loss on masked patches

        return loss


@auto_docstring(
    custom_intro="""
    SegGpt model with a decoder on top for one-shot image segmentation.
    """
)
class SegGptForImageSegmentation(SegGptPreTrainedModel):
    def __init__(self, config: SegGptConfig):
        super().__init__(config)
        self.config = config

        self.model = SegGptModel(config)
        self.decoder = SegGptDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        prompt_pixel_values: torch.Tensor,
        prompt_masks: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        feature_ensemble: Optional[bool] = None,
        embedding_type: Optional[str] = None,
        labels: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SegGptImageSegmentationOutput]:
        r"""
        prompt_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt pixel values. Prompt pixel values can be obtained using [`AutoImageProcessor`]. See
            [`SegGptImageProcessor.__call__`] for details.
        prompt_masks (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Prompt mask. Prompt mask can be obtained using [`AutoImageProcessor`]. See [`SegGptImageProcessor.__call__`] for
            details.
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        feature_ensemble (`bool`, *optional*):
            Boolean indicating whether to use feature ensemble or not. If `True`, the model will use feature ensemble
            if we have at least two prompts. If `False`, the model will not use feature ensemble. This argument should
            be considered when doing few-shot inference on an input image i.e. more than one prompt for the same image.
        embedding_type (`str`, *optional*):
            Embedding type. Indicates whether the prompt is a semantic or instance embedding. Can be either
            instance or semantic.
        labels (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`, *optional*):
            Ground truth mask for input images.

        Examples:

        ```python
        >>> from transformers import SegGptImageProcessor, SegGptForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> image_input_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_2.jpg"
        >>> image_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1.jpg"
        >>> mask_prompt_url = "https://raw.githubusercontent.com/baaivision/Painter/main/SegGPT/SegGPT_inference/examples/hmbb_1_target.png"

        >>> image_input = Image.open(requests.get(image_input_url, stream=True).raw)
        >>> image_prompt = Image.open(requests.get(image_prompt_url, stream=True).raw)
        >>> mask_prompt = Image.open(requests.get(mask_prompt_url, stream=True).raw).convert("L")

        >>> checkpoint = "BAAI/seggpt-vit-large"
        >>> model = SegGptForImageSegmentation.from_pretrained(checkpoint)
        >>> image_processor = SegGptImageProcessor.from_pretrained(checkpoint)

        >>> inputs = image_processor(images=image_input, prompt_images=image_prompt, prompt_masks=mask_prompt, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> result = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[(image_input.height, image_input.width)])[0]
        >>> print(list(result.shape))
        [170, 297]
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if bool_masked_pos is None:
            num_patches = self.model.embeddings.patch_embeddings.num_patches
            bool_masked_pos_zeros = torch.zeros(num_patches // 2, dtype=torch.bool, device=pixel_values.device)
            bool_masked_pos_ones = torch.ones(
                num_patches - num_patches // 2, dtype=torch.bool, device=pixel_values.device
            )
            bool_masked_pos = torch.cat([bool_masked_pos_zeros, bool_masked_pos_ones])
            bool_masked_pos = bool_masked_pos.unsqueeze(0)

        outputs = self.model(
            pixel_values=pixel_values,
            prompt_pixel_values=prompt_pixel_values,
            prompt_masks=prompt_masks,
            bool_masked_pos=bool_masked_pos,
            feature_ensemble=feature_ensemble,
            embedding_type=embedding_type,
            labels=labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        intermediate_hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[-1]
        intermediate_hidden_states = torch.cat(intermediate_hidden_states, dim=-1)
        pred_masks = self.decoder(intermediate_hidden_states)

        loss = None
        if labels is not None:
            loss_fn = SegGptLoss(self.config)
            loss = loss_fn(prompt_masks, pred_masks, labels, bool_masked_pos)

        if not return_dict:
            output = (pred_masks,)
            if output_hidden_states:
                output = output + (outputs[1],)

            if output_attentions:
                idx = 2 if output_hidden_states else 1
                output = output + (outputs[idx],)

            if loss is not None:
                output = (loss,) + output
            return output

        return SegGptImageSegmentationOutput(
            loss=loss,
            pred_masks=pred_masks,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["SegGptModel", "SegGptPreTrainedModel", "SegGptForImageSegmentation"]
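
# How the loss mask and the default bool_masked_pos line up with pixels
# (illustrative; the 448x448 / patch_size=16 numbers are the published
# checkpoint's defaults, not a requirement): with prompt_masks and labels both
# (B, 3, 448, 448), SegGptLoss stacks them into a (B, 3, 896, 448) ground
# truth; bool_masked_pos carries one flag per patch of that stacked image
# (2 * 28 * 28 = 1568), `repeat` expands it to (B, 1568, 16 * 16 * 3), and
# `unpatchify` folds it back into a (B, 3, 896, 448) pixel mask, so only the
# masked (prediction) half contributes to the mean. When no mask is supplied,
# the models build the equivalent of:
#   num_patches = model.model.embeddings.patch_embeddings.num_patches
#   bool_masked_pos = torch.cat(
#       [torch.zeros(num_patches // 2, dtype=torch.bool),
#        torch.ones(num_patches - num_patches // 2, dtype=torch.bool)]
#   ).unsqueeze(0)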