"""Pix2Struct modeling file"""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    CausalLMOutputWithCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    DUMMY_INPUTS,
    DUMMY_MASK,
    auto_docstring,
    is_torch_flex_attn_available,
    is_torch_fx_proxy,
    is_torchdynamo_compiling,
    logging,
)
from .configuration_pix2struct import Pix2StructConfig, Pix2StructTextConfig, Pix2StructVisionConfig


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class Pix2StructLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # T5-style "RMS" norm: scale by the root mean square, computed in fp32 for
        # numerical stability, with no mean subtraction and no bias.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert back into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states


try:
    from apex.normalization import FusedRMSNorm

    Pix2StructLayerNorm = FusedRMSNorm  # noqa

    logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNorm")
except ImportError:
    # using the normal Pix2StructLayerNorm
    pass
except Exception:
    logger.warning("Discovered apex but it failed to load, falling back to Pix2StructLayerNorm")
    pass
class Pix2StructVisionEmbeddings(nn.Module):
    r"""
    Construct the embeddings from patches. In `Pix2Struct` the input is different from classic Vision-transformer
    models. Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens).
    Each patch is represented by a vector of `hidden_size` values.
    """

    def __init__(self, config: Pix2StructConfig) -> None:
        super().__init__()
        self.patch_projection = nn.Linear(config.patch_embed_hidden_size, config.hidden_size)

        self.row_embedder = nn.Embedding(config.seq_len, config.hidden_size)
        self.column_embedder = nn.Embedding(config.seq_len, config.hidden_size)

        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, flattened_patches: torch.Tensor) -> torch.Tensor:
        # the row and column indices are stored in the first and second positions of each flattened patch
        row_indices = flattened_patches[:, :, 0].long()
        col_indices = flattened_patches[:, :, 1].long()

        flattened_patches = flattened_patches[:, :, 2:]

        embeddings = self.patch_projection(flattened_patches)
        row_embeddings = self.row_embedder(row_indices)
        col_embeddings = self.column_embedder(col_indices)

        # sum all embeddings together
        embeddings = embeddings + row_embeddings + col_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings
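# A minimal sketch (illustrative only, not part of the modeling code) of the input layout
# `Pix2StructVisionEmbeddings` assumes: each flattened patch is prefixed with its row and
# column index, so the feature dimension is `patch_embed_hidden_size + 2`.
#
#     >>> import torch
#     >>> batch_size, seq_len, patch_dim = 1, 4, 768
#     >>> patches = torch.randn(batch_size, seq_len, patch_dim + 2)
#     >>> patches[..., 0] = torch.arange(seq_len)  # row index of each patch
#     >>> patches[..., 1] = 0                      # column index of each patch
#
# The module projects `patches[..., 2:]` and adds the two learned positional embeddings
# looked up from the first two channels.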
class Pix2StructVisionAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_attention_heads
        self.dropout = config.attention_dropout
        self.inner_dim = self.n_heads * self.key_value_proj_dim

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.query = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.key = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.value = nn.Linear(self.hidden_size, self.inner_dim, bias=False)
        self.output = nn.Linear(self.inner_dim, self.hidden_size, bias=False)

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        output_attentions=False,
    ):
        """
        Self-attention block
        """
        # Input is (batch_size, seq_length, dim), mask is (batch_size, key_length) (non-causal)
        batch_size, seq_length = hidden_states.shape[:2]

        def to_projection_shape(states):
            """projection"""
            return states.contiguous().view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        # get query states -> (batch_size, n_heads, seq_length, dim_per_head)
        query_states = to_projection_shape(self.query(hidden_states))

        # get key/value states
        key_states = to_projection_shape(self.key(hidden_states))
        value_states = to_projection_shape(self.value(hidden_states))

        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            position_bias = torch.zeros(
                (1, self.n_heads, seq_length, seq_length), device=scores.device, dtype=scores.dtype
            )
            if self.gradient_checkpointing and self.training:
                position_bias.requires_grad = True

            if attention_mask is not None and attention_mask.dim() == 2:
                position_bias = position_bias + attention_mask[:, None, None, :].to(position_bias.device)
            elif attention_mask is not None:
                # (batch_size, n_heads, seq_length, key_length)
                position_bias = position_bias + attention_mask.to(position_bias.device)
            elif not is_torchdynamo_compiling():
                attention_mask = torch.ones(
                    (batch_size, seq_length), device=position_bias.device, dtype=position_bias.dtype
                )
                position_bias = position_bias + attention_mask.to(position_bias.device)

            # the mask is 1 for tokens to attend to and 0 for padded tokens; flip it so that
            # padded positions can be filled with a large negative bias below
            position_bias = 1 - position_bias

        position_bias_masked = position_bias.masked_fill(position_bias == 1, torch.finfo(scores.dtype).min)
        scores += position_bias_masked
        scores = torch.max(scores, torch.tensor(torch.finfo(scores.dtype).min))

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)

        # (batch_size, seq_length, dim)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim)
        attn_output = self.output(attn_output)

        outputs = (attn_output,) + (position_bias,)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs
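# How a padding mask becomes an additive bias (illustrative only): a 0/1 padding mask is
# flipped so padded positions become 1, then those positions are filled with the dtype
# minimum so the softmax assigns them (almost) zero probability.
#
#     >>> import torch
#     >>> mask = torch.tensor([1.0, 1.0, 0.0])  # 1 = attend, 0 = padding
#     >>> bias = 1 - mask                       # tensor([0., 0., 1.])
#     >>> bias = bias.masked_fill(bias == 1, torch.finfo(torch.float32).min)
#     >>> torch.softmax(bias, dim=-1)           # the padded slot gets ~0 weight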
class Pix2StructVisionMlp(nn.Module):
    def __init__(self, config: Pix2StructVisionConfig):
        super().__init__()
        self.wi_0 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.hidden_size, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        # cast the activations to the dtype of `self.wo` (which may be kept e.g. in
        # float32 for 8-bit quantization), unless the weights are int8
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
        return hidden_states
dej                  deej                     deej                     dede	e
ej                  ej                  f   e
ej                     f   f
d	Z xZS )Pix2StructVisionLayerrI   rJ   Nc                 *   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   r-   )r$   r%   chunk_size_feed_forwardseq_len_dimrc   	attentionr   mlpr"   r,   layer_norm_epspre_mlp_layer_normpre_attention_layer_normrW   s     r/   r%   zPix2StructVisionLayer.__init__  ss    '-'E'E$26:&v."5f6H6HfNcNc"d(;F<N<NTZTiTi(j%r0   r=   r   	head_maskr   c                     |}| j                  |      }| j                  ||||      }|d   }|dd  }||z   }| j                  |      }	| j                  |	      |z   }	|	f|z   }|S )N)r   r   r   r   r   )r   r   r   r   )
r+   r=   r   r   r   residualself_attention_outputsattention_outputr   layer_outputs
             r/   r?   zPix2StructVisionLayer.forward  s     ! 55mD!%)%/	 "0 "
 2!4(, )83 ..}=xx-=/G+r0   )NNF)rA   rB   rC   r   r%   r'   ra   r   boolr   tupler?   rD   rE   s   @r/   r   r     s    k/ kD k 26,0"'|| !. ELL)	
   
uU\\5<</0%2EE	Fr0   r   c                        e Zd Zdeddf fdZ	 	 	 	 	 ddej                  deej                     deej                     ded	ed
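# Note on the layout above (a reading aid, not normative): this is a standard pre-norm
# transformer layer, so each sub-block sees a normalized input while the residual stream
# itself is never normalized in place:
#
#     x = x + Attention(LayerNorm(x))
#     x = x + MLP(LayerNorm(x))
#
# The final `Pix2StructLayerNorm` over the residual stream is applied once, at the end
# of `Pix2StructVisionModel.forward`.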
ede	e
ef   fdZ xZS )Pix2StructVisionEncoderrI   rJ   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r$   r%   rI   r   
ModuleListrangenum_hidden_layersr   layerrr   )r+   rI   _r.   s      r/   r%   z Pix2StructVisionEncoder.__init__3  sP    ]]5QWQiQiKj#ka$9&$A#kl
&+# $ls   A#r=   r   r   r   output_hidden_statesreturn_dictc                    |rdnd }|rdnd }t        | j                        D ]2  \  }	}
|r||fz   }|||	   nd } |
||||      }|d   }|s*||d   fz   }4 |r||fz   }|st        d |||fD              S t        |||      S )N r   r   c              3   &   K   | ]	  }||  y wrL   r   .0vs     r/   	<genexpr>z2Pix2StructVisionEncoder.forward.<locals>.<genexpr>V  s     mq_`_lms   last_hidden_stater=   
attentions)	enumerater   r   r   )r+   r=   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_moduler   layer_outputss                r/   r?   zPix2StructVisionEncoder.forward9  s     #7BD$5b4(4 	POA|#$58H$H!.7.CilO(YjkM)!,M &9]1=M<O&O#	P   1]4D Dm]4EGZ$[mmm++*
 	
r0   )NNFFT)rA   rB   rC   r   r%   r'   ra   r   r   r   r   r   r?   rD   rE   s   @r/   r   r   2  s    ,/ ,D , 26,0"'%* "
||"
 !."
 ELL)	"

  "
 #"
 "
 
uo%	&"
r0   r   c                   8    e Zd ZU eed<   dZed        Zd Zd Z	y)Pix2StructPreTrainedModelrI   Fc                 v    t        j                  t              }t        j                  t              }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r'   r   r   r   )r+   r   
@auto_docstring
class Pix2StructPreTrainedModel(PreTrainedModel):
    config: Pix2StructConfig

    _can_compile_fullgraph = False

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, Pix2StructLayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(module, Pix2StructTextDenseGatedActDense):
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )
            d_ff = self.config.text_config.d_ff if isinstance(self.config, Pix2StructConfig) else self.config.d_ff

            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((hidden_size) ** -0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((hidden_size) ** -0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, Pix2StructTextAttention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )
            key_value_proj_dim = (
                self.config.text_config.d_kv if isinstance(self.config, Pix2StructConfig) else self.config.d_kv
            )
            n_heads = (
                self.config.text_config.num_heads
                if isinstance(self.config, Pix2StructConfig)
                else self.config.num_heads
            )

            module.query.weight.data.normal_(mean=0.0, std=factor * ((hidden_size * key_value_proj_dim) ** -0.5))
            module.key.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
            module.value.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
            module.output.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((hidden_size) ** -0.5))
        elif isinstance(module, nn.Embedding):
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )

            module.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, Pix2StructTextModel):
            hidden_size = (
                self.config.text_config.hidden_size
                if isinstance(self.config, Pix2StructConfig)
                else self.config.hidden_size
            )

            module.lm_head.weight.data.normal_(mean=0.0, std=factor * (hidden_size**-0.5))
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast the weights to `fp32` and cast them back to the desired `dtype` to avoid
            # `trunc_normal_cpu` not being implemented for `half`
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()

    # Copied from transformers.models.t5.modeling_t5.T5PreTrainedModel._shift_right
    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the "
                "pad_token_id. See Pix2Struct docs for more information."
            )

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids
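# `_shift_right` turns labels into decoder inputs by prepending `decoder_start_token_id`
# and dropping the last position; any remaining -100 loss-padding sentinels become real
# pad tokens. A worked example (illustrative only), assuming both special ids are 0:
#
#     labels:        [ 12,  47, -100, -100 ]
#     shifted right: [  0,  12,   47, -100 ]  ->  [ 0, 12, 47, 0 ]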
@auto_docstring
class Pix2StructVisionModel(Pix2StructPreTrainedModel):
    config: Pix2StructVisionConfig
    main_input_name = "flattened_patches"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Pix2StructVisionLayer"]

    def __init__(self, config: Pix2StructConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = Pix2StructVisionEmbeddings(config)
        self.encoder = Pix2StructVisionEncoder(config)

        self.layernorm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_projection

    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        flattened_patches: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
            [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
            paper](https://huggingface.co/papers/2210.03347) (figure 5) for more details.

        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, Pix2StructVisionModel

        >>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 2048, 768]
        ```
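
        If `attention_mask` is not provided, a padding mask is derived from the patches
        themselves: positions whose flattened patch sums to zero (pure padding) are masked
        out, which is equivalent to (a minimal sketch of the default behaviour):

        ```python
        >>> attention_mask = (inputs["flattened_patches"].sum(dim=-1) != 0).float()
        ```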
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if flattened_patches is None:
            raise ValueError("You have to specify flattened_patches")

        if attention_mask is None:
            # check where `flattened_patches` is not 0
            attention_mask = (flattened_patches.sum(dim=-1) != 0).float()

        # Prepare head mask if needed; 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(flattened_patches)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        if not return_dict:
            head_outputs = (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
class Pix2StructTextDenseGatedActDense(nn.Module):
    def __init__(self, config: Pix2StructTextConfig):
        super().__init__()
        self.wi_0 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.hidden_size, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.hidden_size, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        # cast the activations to the dtype of `self.wo` (which may be kept e.g. in
        # float32 for 8-bit quantization), unless the weights are int8
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
        return hidden_states


class Pix2StructTextLayerFF(nn.Module):
    def __init__(self, config: Pix2StructTextConfig):
        super().__init__()
        self.DenseReluDense = Pix2StructTextDenseGatedActDense(config)
        self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class Pix2StructTextAttention(nn.Module):
    def __init__(
        self, config: Pix2StructTextConfig, has_relative_attention_bias=False, layer_idx: Optional[int] = None
    ):
        super().__init__()
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.hidden_size = config.hidden_size
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended "
                "and will lead to errors during the forward call if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.query = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.key = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.value = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.output = nn.Linear(self.hidden_size, self.hidden_size, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same
        bucket. This should allow for more graceful generalization to longer sequences than the model has been
        trained on.

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # the other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        if cache_position is None:
            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        else:
            context_position = cache_position[:, None].to(device)
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=False,
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length)
        batch_size, seq_length = hidden_states.shape[:2]

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        query_states = self.query(hidden_states)
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        if past_key_value is not None and isinstance(past_key_value, EncoderDecoderCache):
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value_states from the cache
                curr_past_key_value = past_key_value.cross_attention_cache
            else:
                curr_past_key_value = past_key_value.self_attention_cache
        else:
            curr_past_key_value = past_key_value

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v, cross-attentions
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.key(current_states)
            value_states = self.value(current_states)
            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to the cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # flag that this layer's cross-attention cache is filled so it can be re-used in later calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            key_length = key_states.shape[-2]
            # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past)
            real_seq_length = query_length if query_length is not None else cache_position[-1] + 1
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(
                    real_seq_length, key_length, device=scores.device, cache_position=cache_position
                )
                position_bias = position_bias[:, :, -seq_length:, :]

            if mask is not None:
                causal_mask = mask[:, :, :, : key_states.shape[-2]]
                position_bias = position_bias + causal_mask

        if self.pruned_heads:
            mask = torch.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        scores += position_bias_masked

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)

        # (batch_size, seq_length, dim)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
        attn_output = self.output(attn_output)

        outputs = (attn_output, position_bias)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class Pix2StructTextLayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.attention = Pix2StructTextAttention(
            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
        )
        self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.attention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]
        return outputs


class Pix2StructTextLayerCrossAttention(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.attention = Pix2StructTextAttention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
        self.layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.attention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]
        return outputs


class Pix2StructTextBlock(GradientCheckpointingLayer):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()

        self.self_attention = Pix2StructTextLayerSelfAttention(
            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
        )

        self.encoder_decoder_attention = Pix2StructTextLayerCrossAttention(config, layer_idx=layer_idx)

        self.mlp = Pix2StructTextLayerFF(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
        cache_position=None,
    ):
        self_attention_outputs = self.self_attention(
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = self_attention_outputs[0]
        attention_outputs = self_attention_outputs[1:]  # keep self-attention outputs and relative position weights

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        do_cross_attention = encoder_hidden_states is not None
        if do_cross_attention:
            cross_attention_outputs = self.encoder_decoder_attention(
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                query_length=cache_position[-1] + 1,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states = cross_attention_outputs[0]

            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

            # keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[1:]

        # apply the feed-forward layer
        hidden_states = self.mlp(hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        return outputs + attention_outputs
@auto_docstring(
    custom_intro="""
    The standalone text decoder of Pix2Struct
    """
)
class Pix2StructTextModel(Pix2StructPreTrainedModel):
    config: Pix2StructTextConfig
    _no_split_modules = ["Pix2StructTextBlock"]
    _tied_weights_keys = ["lm_head.weight"]
    supports_gradient_checkpointing = True

    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)

        self.layer = nn.ModuleList(
            [
                Pix2StructTextBlock(config, has_relative_attention_bias=bool(i == 0), layer_idx=i)
                for i in range(config.num_layers)
            ]
        )
        self.final_layer_norm = Pix2StructLayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()
        self.gradient_checkpointing = False

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple[torch.FloatTensor, ...], CausalLMOutputWithCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
            embeddings so you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoProcessor, Pix2StructTextModel

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        ```
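
        Positions where `labels == -100` are ignored by the loss, so padding can be masked
        out before the call (a minimal sketch, assuming the pad token id is `0`):

        ```python
        >>> labels = inputs["input_ids"].clone()
        >>> labels[labels == 0] = -100  # exclude padding from the loss
        >>> outputs = model(**inputs, labels=labels)
        ```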
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        if use_cache and past_key_values is None:
            if self.config.is_encoder_decoder:
                past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
            else:
                past_key_values = DynamicCache()

        past_key_values_length = 0
        if cache_position is not None:
            past_key_values_length = cache_position[0]
        elif past_key_values is not None:
            past_key_values_length = past_key_values.get_seq_length()

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None:
            mask_seq_length = (
                past_key_values.get_seq_length() + seq_length if past_key_values is not None else seq_length
            )
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        if self.config.is_decoder:
            causal_mask = self._update_causal_mask(
                attention_mask,
                inputs_embeds,
                cache_position,
                past_key_values.self_attention_cache
                if isinstance(past_key_values, EncoderDecoderCache)
                else past_key_values,
                output_attentions,
            )
        else:
            causal_mask = attention_mask[:, None, None, :]
            causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
            causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head masks if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, layer_module in enumerate(self.layer):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                causal_mask,
                position_bias,
                encoder_hidden_states,
                encoder_extended_attention_mask,
                encoder_decoder_position_bias,
                layer_head_mask=layer_head_mask,
                cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_values,
                use_cache=use_cache,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]

            # we share the position biases between the layers - the first layer stores them
            position_bias = layer_outputs[1]
            if encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[3 if output_attentions else 2]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[2],)
                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[4],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        logits = self.lm_head(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100, reduction="mean")
            loss = loss_fct(logits.contiguous().view(-1, logits.size(-1)), labels.contiguous().view(-1))

        if not return_dict:
            return tuple(
                v
                for v in [loss, logits, past_key_values, all_hidden_states, all_attentions, all_cross_attentions]
                if v is not None
            )
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, rely on its `is_causal` argument instead of its `attn_mask` argument in
        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA
        # will fail to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output_attentions is True, the sdpa implementation falls back to the eager one
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, generate a causal 4D mask here.
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows of the causal_mask, for example the relevant first rows
            # when using left padding. This is required by F.scaled_dot_product_attention's memory-efficient path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask
@auto_docstring(
    custom_intro="""
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    """
)
class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel, GenerationMixin):
    config: Pix2StructConfig
    main_input_name = "flattened_patches"
    _tied_weights_keys = ["decoder.lm_head.weight"]

    def __init__(self, config: Pix2StructConfig):
        super().__init__(config)

        self.encoder = Pix2StructVisionModel(config.vision_config)
        self.decoder = Pix2StructTextModel(config.text_config)

        self.is_vqa = config.is_vqa

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.decoder.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.decoder.set_input_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.decoder.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.decoder.set_output_embeddings(new_embeddings)

    def get_decoder(self):
        return self.decoder

    def get_encoder(self):
        return self.encoder

    @auto_docstring
    def forward(
        self,
        flattened_patches: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Cache] = None,
        labels: Optional[torch.LongTensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
            Flattened pixel patches. The `hidden_size` is obtained by the following formula: `hidden_size` =
            `num_channels` * `patch_size` * `patch_size`

            The process of flattening the pixel patches is done by `Pix2StructProcessor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder.

        Example:

        Inference:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> # autoregressive generation
        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A stop sign is on a street corner.

        >>> # conditional generation
        >>> text = "A picture of"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A picture of a stop sign with a red stop sign
        ```

        Training:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "A stop sign is on the street corner."

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> labels = processor(text=text, return_tensors="pt").input_ids

        >>> # forward pass
        >>> outputs = model(**inputs, labels=labels)
        >>> loss = outputs.loss
        >>> print(f"{loss.item():.5f}")
        5.94282
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.text_config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                flattened_patches=flattened_patches,
                attention_mask=attention_mask,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)
            decoder_attention_mask = (
                decoder_attention_mask
                if decoder_attention_mask is not None
                else decoder_input_ids.ne(self.config.pad_token_id).float()
            )
            # Always attend to the first token
            decoder_attention_mask[:, 0] = 1

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            labels=labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqLMOutput(
            loss=decoder_outputs.loss,
            logits=decoder_outputs.logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


__all__ = [
    "Pix2StructPreTrainedModel",
    "Pix2StructForConditionalGeneration",
    "Pix2StructVisionModel",
    "Pix2StructTextModel",
]
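# Illustrative sketch (added here for exposition, not part of the upstream source): how `forward`
# derives decoder inputs during training when only `labels` are given. `_shift_right` prepends the
# decoder start token (assuming the default config, where `decoder_start_token_id == pad_token_id == 0`
# and `eos_token_id == 1`) and drops the last position; the default decoder attention mask then
# always attends to that first token:
#
#   labels = torch.tensor([[42, 43, 1]])             # 1 == eos_token_id
#   decoder_input_ids = torch.tensor([[0, 42, 43]])  # == model._shift_right(labels)
#   decoder_attention_mask = decoder_input_ids.ne(0).float()
#   decoder_attention_mask[:, 0] = 1                 # always attend to the first token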