
# BLIP text model: a BERT-style text encoder/decoder used by the BLIP vision-language models.

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import Tensor, device, nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import (
    PreTrainedModel,
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    prune_linear_layer,
)
from ...utils import logging
from .configuration_blip import BlipTextConfig


logger = logging.get_logger(__name__)


class BlipTextEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, max_position_embeddings) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds

        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class BlipTextSelfAttention(nn.Module):
    def __init__(self, config, is_cross_attention, layer_idx=None):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.layer_idx = layer_idx

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        # If this module is used for cross-attention, the keys and values come from the encoder states.
        is_cross_attention = encoder_hidden_states is not None
        current_states = encoder_hidden_states if is_cross_attention else hidden_states

        is_updated = False
        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse the cross-attention keys and values already stored in the cache
            key_layer = curr_past_key_value.layers[self.layer_idx].keys
            value_layer = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_layer = (
                self.key(current_states)
                .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
                .transpose(1, 2)
            )
            value_layer = (
                self.value(current_states)
                .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
                .transpose(1, 2)
            )
            if past_key_value is not None:
                # save the key/value states to the cache so they can be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_layer, value_layer = curr_past_key_value.update(
                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
                )
                # mark the cross-attention cache of this layer as filled so it is re-used on subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the BlipTextModel forward() function)
            attention_scores = attention_scores + attention_mask.to(attention_scores.device)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer, attention_probs


class BlipTextSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BlipTextAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False, layer_idx=None):
        super().__init__()
        self.self = BlipTextSelfAttention(config, is_cross_attention, layer_idx=layer_idx)
        self.output = BlipTextSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class BlipTextIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BlipTextOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BlipTextLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_num):
        super().__init__()
        self.config = config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = BlipTextAttention(config, layer_idx=layer_num)
        self.layer_num = layer_num
        if self.config.is_decoder:
            self.crossattention = BlipTextAttention(
                config, is_cross_attention=self.config.is_decoder, layer_idx=layer_num
            )
        self.intermediate = BlipTextIntermediate(config)
        self.output = BlipTextOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            past_key_value=past_key_value,
            cache_position=cache_position,
        )
        attention_output = self_attention_outputs[0]

        outputs = self_attention_outputs[1:]
        if encoder_hidden_states is not None:
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask=encoder_attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                output_attentions=output_attentions,
                past_key_value=past_key_value,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs

        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class BlipTextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([BlipTextLayer(config, i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        return_legacy_cache = False
        if use_cache:
            if not isinstance(past_key_values, Cache):
                logger.warning_once(
                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                    "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                    "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
                )
                return_legacy_cache = True
                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
            elif isinstance(past_key_values, DynamicCache):
                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and encoder_hidden_states is not None else None

        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                past_key_values,
                output_attentions,
                cache_position,
            )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if return_legacy_cache:
            past_key_values = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    past_key_values,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class BlipTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


class BlipTextPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BlipTextLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BlipTextPredictionHeadTransform(config)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BlipTextOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BlipTextLMPredictionHead(config)

    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BlipTextPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config: BlipTextConfig
    base_model_prefix = "bert"
    _no_split_modules = []

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class BlipTextModel(BlipTextPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. To behave as a decoder, the model needs to be
    initialized with the `is_decoder` argument set to `True`; an `encoder_hidden_states` is then expected as an input
    to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = BlipTextEmbeddings(config)
        self.encoder = BlipTextEncoder(config)
        self.pooler = BlipTextPooler(config) if add_pooling_layer else None

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self, attention_mask: Tensor, input_shape: tuple[int], device: device, is_decoder: bool
    ) -> Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
        """
        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves,
        # in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length].
            # If the model is a decoder, apply a causal mask in addition to the padding mask.
            if is_decoder:
                batch_size, seq_length = input_shape
                seq_ids = torch.arange(seq_length, device=device)
                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                causal_mask = causal_mask.to(attention_mask.dtype)

                # in case past_key_values are used we need to add a prefix ones mask to the causal mask
                if causal_mask.shape[1] < attention_mask.shape[1]:
                    prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                    causal_mask = torch.cat(
                        [
                            torch.ones(
                                (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype
                            ),
                            causal_mask,
                        ],
                        axis=-1,
                    )

                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and a large negative value for masked positions.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        is_decoder: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = inputs_embeds.device
        elif encoder_embeds is not None:
            input_shape = encoder_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = encoder_embeds.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")

        # past_key_values_length
        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = (
                past_key_values[0][0].shape[2]
                if not isinstance(past_key_values, Cache)
                else past_key_values.get_seq_length()
            )

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length)).to(device)

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves,
        # in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device, is_decoder)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head.
        # head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        if encoder_embeds is None:
            embedding_output = self.embeddings(
                input_ids=input_ids,
                position_ids=position_ids,
                inputs_embeds=inputs_embeds,
                past_key_values_length=past_key_values_length,
            )
        else:
            embedding_output = encoder_embeds

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )


class BlipTextLMHeadModel(BlipTextPreTrainedModel, GenerationMixin):
    def __init__(self, config):
        super().__init__(config)

        self.bert = BlipTextModel(config, add_pooling_layer=False)
        self.cls = BlipTextOnlyMLMHead(config)
        self.label_smoothing = config.label_smoothing

    def get_input_embeddings(self):
        return self.bert.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.bert.set_input_embeddings(new_embeddings)

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.Tensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        return_logits: Optional[bool] = False,
        is_decoder: Optional[bool] = True,
        reduction: Optional[str] = "mean",
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
            `past_key_values`).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            is_decoder=is_decoder,
            cache_position=cache_position,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        if return_logits:
            return prediction_scores[:, :-1, :].contiguous()

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device)
            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing)
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
            if reduction == "none":
                lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, attention_mask=attention_mask, **model_kwargs
        )
        model_inputs["is_decoder"] = True

        return model_inputs


__all__ = ["BlipTextModel", "BlipTextLMHeadModel", "BlipTextPreTrainedModel"]