
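# Architecture note: CLIPSeg pairs a (frozen at segmentation time) CLIP text/vision backbone with a
# lightweight transformer decoder. Intermediate activations of the CLIP vision encoder are linearly
# reduced, modulated by a conditional prompt embedding (text or image) via FiLM, and upsampled with a
# transposed convolution into a per-prompt segmentation logit map. See the classes below for details.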
"""PyTorch CLIPSeg model."""

import copy
import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_clipseg import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig


logger = logging.get_logger(__name__)


# contrastive loss function, adapted from
# https://sachinruk.github.io/blog/2021-03-07-clip.html
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
@dataclass
@auto_docstring
class CLIPSegOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
@dataclass
@auto_docstring
class CLIPSegDecoderOutput(ModelOutput):
    r"""
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    """

    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
@dataclass
@auto_docstring
class CLIPSegImageSegmentationOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Binary cross entropy loss for segmentation.
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
        Conditional embeddings used for segmentation.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
        Pooled output of the [`CLIPSegVisionModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    decoder_output (`CLIPSegDecoderOutput`):
        The output of the [`CLIPSegDecoder`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    conditional_embeddings: Optional[torch.FloatTensor] = None
    pooled_output: Optional[torch.FloatTensor] = None
    vision_model_output: BaseModelOutputWithPooling = None
    decoder_output: CLIPSegDecoderOutput = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["vision_model_output", "decoder_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class CLIPSegVisionEmbeddings(nn.Module):
    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=True) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class CLIPSegTextEmbeddings(nn.Module):
    def __init__(self, config: CLIPSegTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class CLIPSegAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Union[CLIPSegTextConfig, CLIPSegVisionConfig]):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # CLIPSeg's text model uses both `causal_attention_mask` and `attention_mask`;
        # when the flash attention kernel is used, `is_causal` is inferred from `causal_attention_mask`
        if self.config._attn_implementation != "flash_attention_2":
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class CLIPSegMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class CLIPSegEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


@auto_docstring
class CLIPSegPreTrainedModel(PreTrainedModel):
    config: CLIPSegConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, CLIPSegTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, CLIPSegVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, CLIPSegAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, CLIPSegMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, CLIPSegModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
class CLIPSegEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([CLIPSegEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class CLIPSegTextTransformer(nn.Module):
    def __init__(self, config: CLIPSegTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = CLIPSegTextEmbeddings(config)
        self.encoder = CLIPSegEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # For `pooled_output` computation
        self.eos_token_id = config.eos_token_id

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # CLIPSeg's text model uses a causal mask, prepare it here.
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # expand attention_mask: [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
        if attention_mask is not None:
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # Legacy behaviour kept for checkpoints whose config still uses the pre-4.24.0 `eos_token_id`:
            # take features from the end-of-text embedding (the highest token id in each sequence).
            # Casting to torch.int for ONNX compatibility: argmax doesn't support int64 inputs with opset 14.
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
            ]
        else:
            # The config has an updated `eos_token_id`, so extra new tokens are possible.
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                .int()
                .argmax(dim=-1),
            ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class CLIPSegTextModel(CLIPSegPreTrainedModel):
    config: CLIPSegTextConfig

    _no_split_modules = ["CLIPSegTextEmbeddings", "CLIPSegEncoderLayer"]

    def __init__(self, config: CLIPSegTextConfig):
        super().__init__(config)
        self.text_model = CLIPSegTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rc   r   r%  r&  )rA  )r?   r   r   rc   r   r%  r&  s          r%   r   zCLIPSegTextModel.forward  s,    2 )%/!5#  
 	
r'   r=  )rG   rH   rI   r   rL   _no_split_modulesri   r   ModulerE  rG  r   r   r"   r   r   r   rC   r   r   r   r   s   @r%   r?  r?    s    02GH0 :bii :;  -115/3,0/3&*
ELL)
 !.
 u||,	

 $D>
 'tn
 d^
 
u00	1
 
r'   r?  c                        e Zd Zdef fdZe	 	 	 	 d
deej                     dee	   dee	   dee	   dee	   de
eef   fd	       Z xZS )CLIPSegVisionTransformerr[   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )rh   ri   r[   rj   rZ   r|   r   r   r   pre_layrnormr  r4  post_layernormr   s      r%   ri   z!CLIPSegVisionTransformer.__init__  sj    &&	1&9LL8M8MN%f- ll9&:O:OPr'   r   r   r%  r&  r   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      }| j                  |      }| j                  ||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )N)r   )r   r   r%  r&  r   r   r8  )r[   r   r%  r)  r|   rO  r4  rP  r   rP   rQ   )
r?   r   r   r%  r&  r   rP   r<  r(  rU   s
             r%   r   z CLIPSegVisionTransformer.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]Ogh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r'   )NNNT)rG   rH   rI   r   ri   r   r   r"   rK   r   r   rC   r   r   r   r   s   @r%   rM  rM    s    Q2 Q  -1/3&*37$
u001$
 $D>$
 'tn	$

 d^$
 #+4.$
 
u00	1$
 $
r'   rM  c                        e Zd ZU eed<   dZdef fdZdej                  fdZ	e
	 	 	 	 	 ddeej                     dee   dee   dee   d	ee   deeef   fd
       Z xZS )CLIPSegVisionModelr[   r   c                 d    t         |   |       t        |      | _        | j	                          y r   )rh   ri   rM  vision_modelrB  rz   s     r%   ri   zCLIPSegVisionModel.__init__  s'     4V<r'   r   c                 B    | j                   j                  j                  S r   )rU  r|   rs   rE   s    r%   rE  z'CLIPSegVisionModel.get_input_embeddings  s      ++;;;r'   r   r%  r   r&  c                 .    | j                  |||||      S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r%  r   r&  )rU  )r?   r   r   r%  r   r&  s         r%   r   zCLIPSegVisionModel.forward  s+    :   %/!5%=# ! 
 	
r'   NNNTN)rG   rH   rI   r   rL   main_input_nameri   r   rK  rE  r   r   r"   rK   r   r   rC   r   r   r   r   s   @r%   rS  rS    s    $O2 <bii <  59,0/337&*"
u001"
 $D>"
 'tn	"

 #+4."
 d^"
 
u00	1"
 "
r'   rS  c                   .    e Zd ZU eed<   def fdZe	 	 	 	 	 	 ddeej                     deej                     deej                     dee
   dee
   dee
   d	ej                  fd
       Ze	 	 	 	 	 ddeej                     dee
   dee
   de
dee
   d	ej                  fd       Ze	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     dee
   dee
   dee
   de
dee
   d	eeef   fd       Z xZS )r  r[   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  |_	        |j                  |_	        |j                  | _
        |j                  | _        |j                  | _        t        |      | _        t!        |      | _        t%        j&                  | j                  | j                  d      | _        t%        j&                  | j                  | j                  d      | _        t%        j,                  t/        j0                  | j2                  j4                              | _        | j9                          y )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)ra   )rh   ri   r
  text_configr   	TypeErrortypevision_configr   r   projection_dimrj   r  r  r2  rA  rM  rU  r   r   r  r  rn   r"   tensorr[   logit_scale_init_valuelogit_scalerB  )r?   r[   r^  ra  r{   s       r%   ri   zCLIPSegModel.__init__.  ss    &,,.?@++,-Q0 
 &..0CD--./q2 
 ((,,+1+F+F(-3-H-H*$33)55 - 9 90=4]C!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r'   r   r   rc   r   r%  r&  r   c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||||      }|d   }| j                  |      }	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```rI  r   )r[   r   r%  r)  rA  r  )
r?   r   r   rc   r   r%  r&  text_outputsrU   text_featuress
             r%   get_text_featureszCLIPSegModel.get_text_featuresR  s    6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B])%/!5# ' 
 %Q,,];r'   r   r   c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||      }|d   }| j                  |      }|S )aI  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```rX  r   )r[   r   r%  r)  rU  r  )	r?   r   r   r%  r   r&  vision_outputsrU   image_featuress	            r%   get_image_featureszCLIPSegModel.get_image_features  s    @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 'q)//>r'   return_lossc
           	         ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  |||||	      }
| j                  ||||||	      }|
d   }| j                  |      }|d   }| j                  |      }||j                  ddd      z  }||j                  ddd      z  }| j                  j                         }t        j                  ||j                               |z  }|j                         }d}|rt        |      }|	s||||||
f}||f|z   S |S t        |||||||
	      S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NrX  rI  r   rb   re   T)r   r   keepdim)r0   r1   r2   r3   r4   r5   r6   )r[   r   r%  r)  rU  rA  r  r  normre  expr"   r   r*   r-   r/   )r?   r   r   r   rc   rn  r   r%  r   r&  rk  rg  r4   r3   re  r2   r1   r0   outputs                      r%   r   zCLIPSegModel.forward  s   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 )%/!5# ' 
 &a(--l;"1o**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO &&**,,,{LNN4DES*,,.0D&lT`bpqF)-)9TGf$EvE-+#%* .
 	
r'   r=  rY  )	NNNNNNNTN)rG   rH   rI   r   rL   ri   r   r   r"   r   r   rK   ri  rm  r   r   rC   r/   r   r   r   s   @r%   r  r  *  s   "} "H  -115/3,0/3&*,ELL), !., u||,	,
 $D>, 'tn, d^, 
		, ,\  59,0/3)-&*0u0010 $D>0 'tn	0
 #'0 d^0 
		0 0d  15481537&*,0/3)-&*\
E,,-\
 u001\
 !.	\

 u//0\
 d^\
 $D>\
 'tn\
 #'\
 d^\
 
um#	$\
 \
r'   r  c                        e Zd ZdZdef fdZ	 d
dej                  dej                  dej                  dee	   de
ej                     f
d	Z xZS )CLIPSegDecoderLayerz
class CLIPSegDecoderLayer(nn.Module):
    """
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        residual = hidden_states

        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )

        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm1(hidden_states)

        residual = hidden_states
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm2(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class CLIPSegDecoder(CLIPSegPreTrainedModel):
    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        self.conditional_layer = config.conditional_layer

        self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim)
        self.film_add = nn.Linear(config.projection_dim, config.reduce_dim)

        if config.use_complex_transposed_convolution:
            transposed_kernels = (config.vision_config.patch_size // 4, config.vision_config.patch_size // 4)

            self.transposed_convolution = nn.Sequential(
                nn.Conv2d(config.reduce_dim, config.reduce_dim, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim,
                    config.reduce_dim // 2,
                    kernel_size=transposed_kernels[0],
                    stride=transposed_kernels[0],
                ),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim // 2, 1, kernel_size=transposed_kernels[1], stride=transposed_kernels[1]
                ),
            )
        else:
            self.transposed_convolution = nn.ConvTranspose2d(
                config.reduce_dim, 1, config.vision_config.patch_size, stride=config.vision_config.patch_size
            )

        depth = len(config.extract_layers)
        self.reduces = nn.ModuleList(
            [nn.Linear(config.vision_config.hidden_size, config.reduce_dim) for _ in range(depth)]
        )

        decoder_config = copy.deepcopy(config.vision_config)
        decoder_config.hidden_size = config.reduce_dim
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        decoder_config.hidden_act = "relu"
        self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))])

    def forward(
        self,
        hidden_states: tuple[torch.Tensor],
        conditional_embeddings: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        activations = hidden_states[::-1]

        output = None
        for i, (activation, layer, reduce) in enumerate(zip(activations, self.layers, self.reduces)):
            if output is not None:
                output = reduce(activation) + output
            else:
                output = reduce(activation)

            if i == self.conditional_layer:
                # FiLM conditioning: scale and shift the tokens with the conditional (prompt) embedding
                output = self.film_mul(conditional_embeddings) * output.permute(1, 0, 2) + self.film_add(
                    conditional_embeddings
                )
                output = output.permute(1, 0, 2)

            layer_outputs = layer(
                output, attention_mask=None, causal_attention_mask=None, output_attentions=output_attentions
            )

            output = layer_outputs[0]

            if output_hidden_states:
                all_hidden_states += (output,)

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        # remove the CLS token and reshape to [batch_size, reduce_dim, height, width]
        output = output[:, 1:, :].permute(0, 2, 1)

        size = int(math.sqrt(output.shape[2]))

        batch_size = conditional_embeddings.shape[0]
        output = output.view(batch_size, output.shape[1], size, size)

        logits = self.transposed_convolution(output).squeeze(1)

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None)

        return CLIPSegDecoderOutput(
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )
    )custom_introc                       e Zd ZU eed<   def fdZ	 	 	 	 	 ddee   deej                     deej                     deej                     deej                     f
dZ
e	 	 	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     deej                     d
eej                     deej                     deej                     deej                     dee   dee   dedee   deeef   fd       Z xZS )CLIPSegForImageSegmentationr[   c                     t         |   |       || _        t        |      | _        |j
                  | _        t        |      | _        | j                          y r   )	rh   ri   r[   r  r  r  ry  decoderrB  rz   s     r%   ri   z$CLIPSegForImageSegmentation.__init__  sI      (	$33%f- 	r'   r   r   r   rc   conditional_pixel_valuesc                    |Vt        |      |k7  rt        d      t        j                         5  | j                  j                  |||      }d d d        |S |St        |      |k7  rt        d      t        j                         5  | j                  j                  |      }d d d        |S t        d      # 1 sw Y   S xY w# 1 sw Y   S xY w)Nz@Make sure to pass as many prompt texts as there are query images)r   rc   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r$   r   r"   no_gradr  ri  rm  )r?   r   r   r   rc   r  rT   s          r%   get_conditional_embeddingsz6CLIPSegForImageSegmentation.get_conditional_embeddings  s      9~+ !cdd )-)D)Dn< *E *& &% &1+,
: !dee `)-)E)EF^)_&` &%	 m  &%` &%s   B9C9CCr   rT   labelsr   r%  r   r&  r   c                 "   ||n| j                   j                  }t        j                         5  | j                  j                  ||d|
|      }| j                  j                  |d         }|r|j                  n|d   }| j                  D cg c]
  }||dz       }}|r<t        |j                  |j                  |	r|j                  nd|j                        }n|	s|dd |dd z   n|}ddd       |$| j                  |j                  d   ||||	      }n[|j                  d   |j                  d   k7  rt        d
      |j                  d   | j                   j                   k7  rt        d      | j#                  |||	|      }|r|j$                  n|d   }d}|8|j'                  |j(                        }t+        j,                         } |||      }|s|||f}||f|z   S |S t/        ||||      S c c}w # 1 sw Y   xY w)aX  
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        conditional_pixel_values: Optional[torch.FloatTensor] = None,
        conditional_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = True,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, CLIPSegImageSegmentationOutput]:
        r"""
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the query images through the frozen CLIP vision encoder
        with torch.no_grad():
            vision_outputs = self.clip.vision_model(
                pixel_values=pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=True,  # we need the intermediate hidden states
                interpolate_pos_encoding=interpolate_pos_encoding,
                return_dict=return_dict,
            )
            pooled_output = self.clip.visual_projection(vision_outputs[1])

            hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2]
            # we add +1 here as the hidden states also include the initial embeddings
            activations = [hidden_states[i + 1] for i in self.extract_layers]

            # update vision_outputs
            if return_dict:
                vision_outputs = BaseModelOutputWithPooling(
                    last_hidden_state=vision_outputs.last_hidden_state,
                    pooler_output=vision_outputs.pooler_output,
                    hidden_states=vision_outputs.hidden_states if output_hidden_states else None,
                    attentions=vision_outputs.attentions,
                )
            else:
                vision_outputs = (
                    vision_outputs[:2] + vision_outputs[3:] if not output_hidden_states else vision_outputs
                )

        # step 2: compute conditional embeddings, either from text, images or a provided embedding
        if conditional_embeddings is None:
            conditional_embeddings = self.get_conditional_embeddings(
                batch_size=pixel_values.shape[0],
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                conditional_pixel_values=conditional_pixel_values,
            )
        else:
            if conditional_embeddings.shape[0] != pixel_values.shape[0]:
                raise ValueError(
                    "Make sure to pass as many conditional embeddings as there are query images in the batch"
                )
            if conditional_embeddings.shape[1] != self.config.projection_dim:
                raise ValueError(
                    "Make sure that the feature dimension of the conditional embeddings matches"
                    " `config.projection_dim`."
                )

        # step 3: forward the activations through the lightweight decoder to predict masks
        decoder_outputs = self.decoder(
            activations,
            conditional_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = decoder_outputs.logits if return_dict else decoder_outputs[0]

        loss = None
        if labels is not None:
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss_fn = nn.BCEWithLogitsLoss()
            loss = loss_fn(logits, labels)

        if not return_dict:
            output = (logits, conditional_embeddings, pooled_output, vision_outputs, decoder_outputs)
            return ((loss,) + output) if loss is not None else output

        return CLIPSegImageSegmentationOutput(
            loss=loss,
            logits=logits,
            conditional_embeddings=conditional_embeddings,
            pooled_output=pooled_output,
            vision_model_output=vision_outputs,
            decoder_output=decoder_outputs,
        )


__all__ = [
    "CLIPSegModel",
    "CLIPSegPreTrainedModel",
    "CLIPSegTextModel",
    "CLIPSegVisionModel",
    "CLIPSegForImageSegmentation",
]
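# Usage sketch (a minimal example assuming the same checkpoint name as the docstrings above;
# the per-prompt logits are typically turned into soft masks with a sigmoid):
#
#   from transformers import AutoProcessor, CLIPSegForImageSegmentation
#
#   processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
#   model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")
#   inputs = processor(text=["a cat"], images=[image], padding=True, return_tensors="pt")
#   masks = torch.sigmoid(model(**inputs).logits)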