
    rh\                     J   d Z ddlZddlZddlmZmZ ddlZddlmZ ddl	mc m
Z ddlZddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZm Z  ddl!m"Z"  ejF                  e$      Z% G d dejL                        Z' G d dejL                        Z(e G d de             Z) G d dejL                        Z* G d dejL                        Z+ G d dejL                        Z, G d de      Z- G d dejL                        Z. G d dejL                        Z/ G d  d!ejL                        Z0e G d" d#e)             Z1 G d$ d%ejL                        Z2 ed&'       G d( d)e)             Z3e G d* d+e)             Z4 ed,'       G d- d.e)             Z5g d/Z6y)0zPyTorch LayoutLMv3 model.    N)OptionalUnion)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringlogging	torch_int   )LayoutLMv3Configc                   *     e Zd ZdZ fdZddZ xZS )LayoutLMv3PatchEmbeddingszLayoutLMv3 image (patch) embeddings. This class also automatically interpolates the position embeddings for varying
    image sizes.c                    t         |           t        |j                  t        j
                  j                        r|j                  n|j                  |j                  f}t        |j                  t        j
                  j                        r|j                  n|j                  |j                  f}|d   |d   z  |d   |d   z  f| _        t        j                  |j                  |j                  ||      | _        y )Nr   r   )kernel_sizestride)super__init__
isinstance
input_sizecollectionsabcIterable
patch_sizepatch_shapennConv2dnum_channelshidden_sizeproj)selfconfig
image_sizer"   	__class__s       /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.pyr   z"LayoutLMv3PatchEmbeddings.__init__4   s     &++[__-E-EF ##V%6%67 	 &++[__-E-EF ##V%6%67 	
 'qMZ]:JqMZXY]<Z[IIf1163E3ES]fpq	    c                 l   | j                  |      }||j                  d| j                  d   | j                  d   d      }|j                  dddd      }|j                  d   |j                  d   }}t        j                  |||fd      }||z   }|j                  d      j                  dd      }|S )Nr   r   r      bicubic)sizemode)	r(   viewr#   permuteshapeFinterpolateflatten	transpose)r)   pixel_valuesposition_embedding
embeddingspatch_heightpatch_widths         r-   forwardz!LayoutLMv3PatchEmbeddings.forwardD   s    YY|,
)!3!8!8D<L<LQ<OQUQaQabcQdfh!i!3!;!;Aq!Q!G(2(8(8(;Z=M=Ma=P+L!"/AWbHcjs!t#&88J''*44Q:
r.   N__name__
__module____qualname____doc__r   rA   __classcell__r,   s   @r-   r   r   0   s    r r.   r   c                   F     e Zd ZdZ fdZd Zd Zd Z	 	 	 	 	 ddZ xZ	S )LayoutLMv3TextEmbeddingszm
    LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
    c                 .   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d       |j                  | _        t        j                  |j$                  |j
                  | j(                        | _        t        j                  |j,                  |j.                        | _        t        j                  |j,                  |j.                        | _        t        j                  |j,                  |j4                        | _        t        j                  |j,                  |j4                        | _        y )N)padding_idxepsposition_ids)r   r0   F)
persistent)r   r   r$   	Embedding
vocab_sizer'   pad_token_idword_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangemax_position_embeddingsexpandrM   position_embeddingsmax_2d_position_embeddingscoordinate_sizex_position_embeddingsy_position_embeddings
shape_sizeh_position_embeddingsw_position_embeddingsr)   r*   r,   s     r-   r   z!LayoutLMv3TextEmbeddings.__init__X   s}   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 "..#%<<**F,>,>DL\L\$
  &(\\&2S2SU[UkUk%l"%'\\&2S2SU[UkUk%l"%'\\&2S2SU[UfUf%g"%'\\&2S2SU[UfUf%g"r.   c           	      H   	 | j                  |d d d d df         }| j                  |d d d d df         }| j                  |d d d d df         }| j                  |d d d d df         }| j                  t	        j
                  |d d d d df   |d d d d df   z
  dd            }| j                  t	        j
                  |d d d d df   |d d d d df   z
  dd            }t	        j                  ||||||gd      }	|	S # t        $ r}t        d      |d }~ww xY w)	Nr   r   r1   r   z;The `bbox` coordinate values should be within 0-1000 range.i  r0   dim)re   rf   
IndexErrorrh   r^   clipri   cat)
r)   bboxleft_position_embeddingsupper_position_embeddingsright_position_embeddingslower_position_embeddingserh   ri   spatial_position_embeddingss
             r-   %calculate_spatial_position_embeddingsz>LayoutLMv3TextEmbeddings.calculate_spatial_position_embeddingso   sN   	c'+'A'A$q!Qw-'P$(,(B(B41a=(Q%(,(B(B41a=(Q%(,(B(B41a=(Q% !% : :5::d1aQR7mVZ[\^_ab[bVcFcefhl;m n $ : :5::d1aQR7mVZ[\^_ab[bVcFcefhl;m n ',ii()))%% 
'
# +*%  	cZ[abb	cs   A,D 	D!DD!c                     |j                  |      j                         }t        j                  |d      j	                  |      |z  }|j                         |z   S )z
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
        r   rl   )neintr^   cumsumtype_aslong)r)   	input_idsrM   maskincremental_indicess        r-   "create_position_ids_from_input_idsz;LayoutLMv3TextEmbeddings.create_position_ids_from_input_ids   sP     ||K(,,.$||Da8@@F$N"'')K77r.   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
        Nr0   r   dtypedevicer   )r3   r^   r_   rM   r~   r   	unsqueezera   )r)   inputs_embedsinput_shapesequence_lengthrP   s        r-   &create_position_ids_from_inputs_embedsz?LayoutLMv3TextEmbeddings.create_position_ids_from_inputs_embeds   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r.   c                 N   |I|6| j                  || j                        j                  |j                        }n| j	                  |      }||j                         }n|j                         d d }|:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }||z   }| j                  |      }	||	z  }| j                  |      }
||
z   }| j                  |      }| j                  |      }|S )Nr0   r   )r   rM   tor   r   r3   r^   zerosr~   rP   rU   rW   rb   rx   rX   r\   )r)   r   rq   token_type_idsrP   r   r   rW   r>   rb   rw   s              r-   rA   z LayoutLMv3TextEmbeddings.forward   s+    $#FFyRVRbRbcff$$   $JJ=Y #..*K',,.s3K!"[[EJJtO`O`OgOghN  00;M $ : :> J"%::
"66|D))
&*&P&PQU&V#"==
^^J/
\\*-
r.   )NNNNN)
rD   rE   rF   rG   r   rx   r   r   rA   rH   rI   s   @r-   rK   rK   S   s3    h.+48
= 'r.   rK   c                   "    e Zd ZU eed<   dZd Zy)LayoutLMv3PreTrainedModelr*   
layoutlmv3c                    t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rz|j                  j
                  j                  d| j                  j                         |j                  2|j                  j
                  |j                     j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yt        |t              r`| j                  j                   rI|j"                  j
                  j                          |j$                  j
                  j                          yyy)zInitialize the weights        )meanstdNg      ?)r   r$   Linearr%   weightdatanormal_r*   initializer_rangebiaszero_rR   rM   rX   fill_LayoutLMv3Modelvisual_embed	cls_token	pos_embed)r)   modules     r-   _init_weightsz'LayoutLMv3PreTrainedModel._init_weights   s[   fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S)0{{''  %%++-  %%++- ( 1r.   N)rD   rE   rF   r   __annotations__base_model_prefixr    r.   r-   r   r      s    $.r.   r   c                   8     e Zd Z fdZddZ	 	 	 	 	 ddZ xZS )LayoutLMv3SelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |j"                  | _        |j$                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())r   r   r'   num_attention_headshasattr
ValueErrorr{   attention_head_sizeall_head_sizer$   r   querykeyvaluerZ   attention_probs_dropout_probr\   has_relative_attention_biashas_spatial_attention_biasrj   s     r-   r   z LayoutLMv3SelfAttention.__init__   s8    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF+1+M+M(*0*K*K'r.   c                     ||z  }|j                  d      j                  d      }||z
  |z  } t        j                  d      |      S )a  
        https://huggingface.co/papers/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
        (PB-Relax). A replacement of the original nn.Softmax(dim=-1)(attention_scores). Seems the new attention_probs
        will result in a slower speed and a little bias. Can use torch.allclose(standard_attention_probs,
        cogview_attention_probs, atol=1e-08) for comparison. The smaller atol (e.g., 1e-08), the better.
        r0   rl   )amaxr   r$   Softmax)r)   attention_scoresalphascaled_attention_scores	max_valuenew_attention_scoress         r-   cogview_attentionz)LayoutLMv3SelfAttention.cogview_attention   sT     #3U":+00b0:DDRH	 7) CuL!rzzb!"677r.   c                    |j                   \  }}}	| j                  |      j                  |d| j                  | j                        j                  dd      }
| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }t        j                  |
t        j                  | j                        z  |j                  dd            }| j                  r5| j                  r)|||z   t        j                  | j                        z  z  }n1| j                  r%||t        j                  | j                        z  z  }|||z   }| j                  |      }| j                  |      }|||z  }t        j                  ||      }|j!                  dddd      j#                         }|j%                         d d | j&                  fz   } |j                  | }|r||f}|S |f}|S )Nr0   r   r1   r   r   )r7   r   r5   r   r   r;   r   r   r^   matmulmathsqrtr   r   r   r\   r6   
contiguousr3   r   )r)   hidden_statesattention_mask	head_maskoutput_attentionsrel_pos
rel_2d_pos
batch_size
seq_length_query_layer	key_layervalue_layerr   attention_probscontext_layernew_context_layer_shapeoutputss                     r-   rA   zLayoutLMv3SelfAttention.forward  s$    %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<dii@X@X6Y(Y[d[n[noqsu[vw++0O0O:!54C[C[9\ \\--$))D4L4L*M MM%/.@ 001AB ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD6G=/2 O\M]r.   )    NNFNN)rD   rE   rF   r   r   rA   rH   rI   s   @r-   r   r      s$    L(
8 <r.   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )LayoutLMv3SelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrN   )r   r   r$   r   r'   denserX   rY   rZ   r[   r\   rj   s     r-   r   zLayoutLMv3SelfOutput.__init__E  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r.   r   input_tensorreturnc                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rB   r   r\   rX   r)   r   r   s      r-   rA   zLayoutLMv3SelfOutput.forwardK  7    

=1]3}|'CDr.   rD   rE   rF   r   r^   TensorrA   rH   rI   s   @r-   r   r   D  1    >U\\  RWR^R^ r.   r   c                   0     e Zd Z fdZ	 	 	 	 	 ddZ xZS )LayoutLMv3Attentionc                 b    t         |           t        |      | _        t	        |      | _        y rB   )r   r   r   r)   r   outputrj   s     r-   r   zLayoutLMv3Attention.__init__T  s&    +F3	*62r.   c                 p    | j                  ||||||      }| j                  |d   |      }|f|dd  z   }	|	S )Nr   r   r   r   )r)   r   )
r)   r   r   r   r   r   r   self_outputsattention_outputr   s
             r-   rA   zLayoutLMv3Attention.forwardY  sY     yy! ! 
  ;;|AF#%QR(88r.   r   )rD   rE   rF   r   rA   rH   rI   s   @r-   r   r   S  s    3 r.   r   c                   6     e Zd Z fdZ	 	 	 	 	 ddZd Z xZS )LayoutLMv3Layerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y Nr   )
r   r   chunk_size_feed_forwardseq_len_dimr   	attentionLayoutLMv3IntermediateintermediateLayoutLMv3Outputr   rj   s     r-   r   zLayoutLMv3Layer.__init__q  sI    '-'E'E$,V426:&v.r.   c                     | j                  ||||||      }|d   }|dd  }	t        | j                  | j                  | j                  |      }
|
f|	z   }	|	S )N)r   r   r   r   r   )r   r   feed_forward_chunkr   r   )r)   r   r   r   r   r   r   self_attention_outputsr   r   layer_outputs              r-   rA   zLayoutLMv3Layer.forwardy  s|     "&/! "0 "
 2!4(,0##T%A%A4CSCSUe
  /G+r.   c                 L    | j                  |      }| j                  ||      }|S rB   )r   r   )r)   r   intermediate_outputr   s       r-   r   z"LayoutLMv3Layer.feed_forward_chunk  s,    "//0@A{{#68HIr.   r   )rD   rE   rF   r   rA   r   rH   rI   s   @r-   r   r   p  s#    / 8r.   r   c                   L     e Zd Z fdZddZd Zd Z	 	 	 	 	 	 	 	 	 ddZ xZS )LayoutLMv3Encoderc                    t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        |j                  | _
        |j                  | _        | j                  rS|j                  | _        |j                  | _        t        j                  | j                  |j                  d      | _        | j                  r|j"                  | _        |j$                  | _        t        j                  | j$                  |j                  d      | _        t        j                  | j$                  |j                  d      | _        y y c c}w )NF)r   )r   r   r*   r$   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingr   r   rel_pos_binsmax_rel_posr   r   rel_pos_biasmax_rel_2d_posrel_2d_pos_binsrel_pos_x_biasrel_pos_y_bias)r)   r*   r   r,   s      r-   r   zLayoutLMv3Encoder.__init__  s   ]]U6KcKcEd#eOF$;#ef
&+#+1+M+M(*0*K*K'++ & 3 3D%11D "		$*;*;V=W=W^c dD**"("7"7D#)#9#9D "$))D,@,@&B\B\ch"iD"$))D,@,@&B\B\ch"iD	 + $fs   E5c                 6   d}|r4|dz  }||dkD  j                         |z  z  }t        j                  |      }n*t        j                  | t        j                  |            }|dz  }||k  }|t        j
                  |j                         |z        t        j
                  ||z        z  ||z
  z  j                  t        j                         z   }	t        j                  |	t        j                  |	|dz
              }	|t        j                  |||	      z  }|S )Nr   r1   r   )r~   r^   absmax
zeros_likelogfloatr   r   min	full_likewhere)
r)   relative_positionbidirectionalnum_bucketsmax_distanceretn	max_exactis_smallval_if_larges
             r-   relative_position_bucketz*LayoutLMv3Encoder.relative_position_bucket  s   AK%)//1K??C		+,A		,,e.>.>?P.QRA  1$	y= !IIaggi)+,txxy8P/QQU`clUlm
"UZZ. yyu|[[\_/]^u{{8Q55
r.   c                    |j                  d      |j                  d      z
  }| j                  || j                  | j                        }t	        j
                         5  | j                  j                  j                         |   j                  dddd      }d d d        |j                         }|S # 1 sw Y   xY w)Nr   r0   r  r  r   r   r   r1   )r   r   r  r  r^   no_gradr	  r   tr6   r   )r)   rP   rel_pos_matr   s       r-   _cal_1d_pos_embz!LayoutLMv3Encoder._cal_1d_pos_emb  s    ",,R0<3I3I"3MM//)))) 0 
 ]]_ 	P''..0027;CCAq!QOG	P$$&	P 	Ps    :B44B=c                    |d d d d df   }|d d d d df   }|j                  d      |j                  d      z
  }|j                  d      |j                  d      z
  }| j                  || j                  | j                        }| j                  || j                  | j                        }t	        j
                         5  | j                  j                  j                         |   j                  dddd      }| j                  j                  j                         |   j                  dddd      }d d d        |j                         }|j                         }||z   }|S # 1 sw Y   0xY w)Nr   r   r   r0   r"  r   r1   )r   r   r  r
  r^   r#  r  r   r$  r6   r  r   )	r)   rq   position_coord_xposition_coord_yrel_pos_x_2d_matrel_pos_y_2d_mat	rel_pos_x	rel_pos_yr   s	            r-   _cal_2d_pos_embz!LayoutLMv3Encoder._cal_2d_pos_emb  st   1a=1a=+55b9<L<V<VWY<ZZ+55b9<L<V<VWY<ZZ11,,,, 2 
	
 11,,,, 2 
	 ]]_ 	V++22446yAII!QPQSTUI++22446yAII!QPQSTUI	V ((*	((*	*
	V 	Vs   A3E%%E.c           	         |rdnd }|rdnd }| j                   r| j                  |      nd }| j                  r| j                  |      nd }t	        | j
                        D ]5  \  }}|r||fz   }|||   nd } |||||||      }|d   }|s-||d   fz   }7 |r||fz   }|st        d |||fD              S t        |||      S )Nr   r   r   r   c              3   $   K   | ]  }|| 
 y wrB   r   ).0vs     r-   	<genexpr>z,LayoutLMv3Encoder.forward.<locals>.<genexpr>  s      
 = s   last_hidden_stater   
attentions)r   r&  r   r.  	enumerater  tupler   )r)   r   rq   r   r   r   output_hidden_statesreturn_dictrP   r?   r@   all_hidden_statesall_self_attentionsr   r   ilayer_modulelayer_head_masklayer_outputss                      r-   rA   zLayoutLMv3Encoder.forward  s     #7BD$5b48<8X8X$&&|4^b373R3RT))$/X\
(4 	POA|#$58H$H!.7.CilO(!%M *!,M &9]1=M<O&O##	P&   1]4D D  "%'   ++*
 	
r.   )Tr      )	NNNFFTNNN)	rD   rE   rF   r   r   r&  r.  rA   rH   rI   s   @r-   r   r     s:    j(."< "7
r.   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )r   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rB   )r   r   r$   r   r'   intermediate_sizer   r   
hidden_actstrr	   intermediate_act_fnrj   s     r-   r   zLayoutLMv3Intermediate.__init__/  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r.   r   r   c                 J    | j                  |      }| j                  |      }|S rB   )r   rG  )r)   r   s     r-   rA   zLayoutLMv3Intermediate.forward7  s&    

=100?r.   r   rI   s   @r-   r   r   .  s#    9U\\ ell r.   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )r   c                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r   r   r$   r   rD  r'   r   rX   rY   rZ   r[   r\   rj   s     r-   r   zLayoutLMv3Output.__init__?  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r.   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rB   r   r   s      r-   rA   zLayoutLMv3Output.forwardE  r   r.   r   rI   s   @r-   r   r   >  r   r.   r   c                       e Zd Z fdZd Zd Zd ZddZd Zd Z	e
	 	 	 	 	 	 	 	 	 	 	 ddeej                     d	eej                     d
eej                     deej                     deej                     deej                     deej                     deej                     dee   dee   dee   deeef   fd       Z xZS )r   c                    t         |   |       || _        |j                  rt	        |      | _        |j                  rt        |      | _        t        |j                  |j                  z        }t        j                  t        j                  dd|j                               | _        t        j                  t        j                  d||z  dz   |j                               | _        t        j&                  d      | _        t        j*                  |j                   |j,                        | _        t        j&                  |j.                        | _        | j                  j2                  s| j                  j4                  r| j7                  ||f       t        j*                  |j                   d      | _        t;        |      | _        | j?                          y )Nr   r   )prN   )r+   gư>) r   r   r*   
text_embedrK   r>   r   r   patch_embedr{   r   r"   r$   	Parameterr^   r   r'   r   r   rZ   pos_droprX   rY   r[   r\   r   r   init_visual_bboxnormr   encoderinit_weights)r)   r*   r3   r,   s      r-   r   zLayoutLMv3Model.__init__N  sO    6v>DO  9@Dv((6+<+<<=D\\%++aF<N<N*OPDN\\%++aq&J\J\*]^DNJJ-DM\\&*<*<&BWBWXDN::f&@&@ADL{{66$++:`:`%%$%>V%7%7TBDI(0r.   c                 .    | j                   j                  S rB   r>   rU   )r)   s    r-   get_input_embeddingsz$LayoutLMv3Model.get_input_embeddingsk  s    ...r.   c                 &    || j                   _        y rB   rX  )r)   r   s     r-   set_input_embeddingsz$LayoutLMv3Model.set_input_embeddingsn  s    */'r.   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrU  r  r   prune_heads)r)   heads_to_pruner  headss       r-   _prune_headszLayoutLMv3Model._prune_headsq  sE    
 +002 	CLE5LLu%//;;EB	Cr.   c           	         t        j                  t        j                  d||d   dz   z  |      |d   d      }t        j                  t        j                  d||d   dz   z  |      |d   d      }t        j                  |dd j	                  |d   d      |dd j	                  |d   d      j                  dd      |dd j	                  |d   d      |dd j	                  |d   d      j                  dd      gd      j                  dd      }t        j                  dd|dz
  |dz
  gg      }t        j                  ||gd      | _	        y)	zJ
        Create the bounding boxes for the visual (patch) tokens.
        r   r   trunc)rounding_modeNr0   rl      )
r^   divr_   stackrepeatr;   r5   tensorrp   visual_bbox)r)   r+   max_lenvisual_bbox_xvisual_bbox_yrj  cls_token_boxs          r-   rS  z LayoutLMv3Model.init_visual_bboxy  s^    		LLGz!}q'897CZPQ]bi
 		LLGz!}q'897CZPQ]bi
 kkcr"))*Q-;cr"))*Q-;EEaKab!((A:ab!((A:DDQJ	 
 $r1+ 	 ueWq['A+&N%OP 99m[%AqIr.   c                     | j                   j                  |dd      }|j                  |      j                  |      }|S r   )rj  rh  r   type)r)   r   r   r   rj  s        r-   calculate_visual_bboxz%LayoutLMv3Model.calculate_visual_bbox  s;    &&--j!Q?!nnV,11%8r.   c                 6   | j                  |      }|j                         \  }}}| j                  j                  |dd      }t	        j
                  ||fd      }| j                  || j                  z   }| j                  |      }| j                  |      }|S )Nr0   r   rl   )	rP  r3   r   ra   r^   rp   r   rR  rT  )r)   r<   r>   r   seq_lenr   
cls_tokenss          r-   forward_imagezLayoutLMv3Model.forward_image  s    %%l3
 ",!2
GQ^^**:r2>
YY
J7Q?
 >>%#dnn4J]]:.
YYz*
r.   r   rq   r   r   rP   r   r   r<   r   r9  r:  r   c                 X	   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }|"|j	                         }|\  }}|j
                  }nL|%|j	                         dd }|\  }}|j
                  }n%|t        |      }|j
                  }nt        d      |||t        j                  |f|      }|&t        j                  t        j                  |      }|<t        j                  t        t              dgz         t        j                  |      }| j                  |||||      }dx}}dx}}|&t        |j                   d   | j                   j"                  z        t        |j                   d	   | j                   j"                  z        }}| j%                  |      }t        j                  ||j                   d
   ft        j                  |      }|t        j&                  ||gd
      }n|}| j                   j(                  s| j                   j*                  r| j                   j*                  r@| j-                  |t        j                  |      }|t        j&                  ||gd
      }n|}t        j.                  d|j                   d
   t        j                  |      j1                  |d
      }||Ut        j.                  dd
   |      j3                  d      }|j5                  |      }t        j&                  ||gd
      }n|}||t        j&                  |gd
      }n|}| j7                  |      }| j9                  |      }n| j                   j(                  s| j                   j*                  rc| j                   j*                  r|}| j                   j(                  r5| j                  j:                  dddd
   f   }|j=                  |      }|}| j?                  |d|j@                        }| jC                  || j                   jD                        }| jG                  ||||||	|
|||
      }|d   }|s	|f|d
d z   S tI        ||jJ                  |jL                        S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        bbox (`torch.LongTensor` of shape `(batch_size, token_sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.
        token_type_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, token_sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, token_sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModel.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")

        >>> outputs = model(**encoding)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr0   zEYou have to specify either input_ids or inputs_embeds or pixel_values)r   r   re  )r   rq   rP   r   r   r1   r   r   rl   )r   r   r   )r   )	rq   rP   r   r   r   r9  r:  r?   r@   r4  )'r*   r   r9  use_return_dictr3   r   lenr   r^   onesr   r~   r8  listr>   r   r7   r"   ru  rp   r   r   rq  r_   rh  r   ra   rX   r\   rP   	expand_asget_extended_attention_maskr   get_head_maskr  rU  r   r   r6  )r)   r   rq   r   r   rP   r   r   r<   r   r9  r:  r   r   r   r   embedding_output
final_bboxfinal_position_idsr?   r@   visual_embeddingsvisual_attention_maskrj  visual_position_idsextended_attention_maskencoder_outputssequence_outputs                               r-   rA   zLayoutLMv3Model.forward  s   \ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] #..*K%0"J
%%F&',,.s3K%0"J
"))F%\*J!((Fdee M$=%!&j*-Ev!V%!&[

SY!Z|{{5k):aS)@#A\bc##)-+  /   +/.
'%)){#,,,Q/$++2H2HHI,,,Q/$++2H2HHI &L !% 2 2< @$)JJ.44Q78

SY%! )!&N<Q+RXY!Z!6{{66$++:`:`;;99"&"<"<V5::bl"<"mK'%*YYk/B%J
%0
&+ll(..q1F'&Q' $ (M,E#(<<;q>&#Q#[#[\]#^L#/#6#6{#CL).LBU3V\])^&)<&$(A#(99.>@Q-RXY#Z #4 #~~.>?#||,<=[[448^8^{{55!
{{66#;;A?OQ?O<OP+55i@%1"040P0PD&0@0F0F 1Q 1
 &&y$++2O2OP	,,+2/!5#%# ' 
 *!,#%(;;;-)77&11
 	
r.   ))   r  i  )NNNNNNNNNNN)rD   rE   rF   r   rY  r[  ra  rS  rq  ru  r   r   r^   
LongTensorFloatTensorboolr   r8  r   rA   rH   rI   s   @r-   r   r   L  sQ   :/0CJ.
"  15+/6:5937155948,0/3&*C
E,,-C
 u''(C
 !!2!23	C

 !!1!12C
 u//0C
 E--.C
   1 12C
 u001C
 $D>C
 'tnC
 d^C
 
uo%	&C
 C
r.   r   c                   *     e Zd ZdZd fd	Zd Z xZS )LayoutLMv3ClassificationHeadz\
    Head for sentence-level classification tasks. Reference: RobertaClassificationHead
    c                    t         |           || _        |r3t        j                  |j
                  dz  |j
                        | _        n/t        j                  |j
                  |j
                        | _        |j                  |j                  n|j                  }t        j                  |      | _
        t        j                  |j
                  |j                        | _        y )Nr   )r   r   pool_featurer$   r   r'   r   classifier_dropoutr[   rZ   r\   
num_labelsout_proj)r)   r*   r  r  r,   s       r-   r   z%LayoutLMv3ClassificationHead.__init__r  s    (6#5#5#96;M;MNDJ6#5#5v7I7IJDJ)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHr.   c                     | j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S rB   )r\   r   r^   tanhr  )r)   xs     r-   rA   z$LayoutLMv3ClassificationHead.forward  sI    LLOJJqMJJqMLLOMM!r.   )FrC   rI   s   @r-   r  r  m  s    Ir.   r  a  
    LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
    for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
    [SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
    [Kleister-NDA](https://github.com/applicaai/kleister-nda).
    )custom_introc                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   deej                     de
eef   fd       Z xZS ) LayoutLMv3ForTokenClassificationc                 p   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        |j                  dk  r0t        j                  |j                  |j                        | _        nt        |d      | _        | j                          y )N
   Fr  )r   r   r  r   r   r$   rZ   r[   r\   r   r'   
classifierr  rV  rj   s     r-   r   z)LayoutLMv3ForTokenClassification.__init__  s      ++)&1zz&"<"<=r! ii(:(:F<M<MNDO:6PUVDOr.   r   rq   r   r   rP   r   r   labelsr   r9  r:  r<   r   c                    ||n| j                   j                  }| j                  ||||||||	|
||      }||j                         }n|j                         dd }|d   }|d   ddd|f   }| j	                  |      }| j                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a!  
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForTokenClassification
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]
        >>> word_labels = example["ner_tags"]

        >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")

        >>> outputs = model(**encoding)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```N)
rq   r   r   rP   r   r   r   r9  r:  r<   r0   r   r   losslogitsr   r6  )r*   rw  r   r3   r\   r  r   r5   r  r   r   r6  )r)   r   rq   r   r   rP   r   r   r  r   r9  r:  r<   r   r   r   r  r  r  loss_fctr   s                        r-   rA   z(LayoutLMv3ForTokenClassification.forward  s:   ^ &1%<k$++B]B]//))%'/!5#% " 
  #..*K',,.s3K ^
!!*Q^4,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r.   NNNNNNNNNNNN)rD   rE   rF   r   r   r   r^   r  r  r  r   r8  r   rA   rH   rI   s   @r-   r  r    sK     15+/6:59371559-1,0/3&*37V
E,,-V
 u''(V
 !!2!23	V

 !!1!12V
 u//0V
 E--.V
   1 12V
 ))*V
 $D>V
 'tnV
 d^V
 u//0V
 
u++	,V
 V
r.   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   deej                     deej                     de
eef   fd       Z xZS )LayoutLMv3ForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        |d      | _        | j                          y NFr  )r   r   r  r   r   r  
qa_outputsrV  rj   s     r-   r   z'LayoutLMv3ForQuestionAnswering.__init__  sC      ++)&16vERr.   r   r   r   rP   r   r   start_positionsend_positionsr   r9  r:  rq   r<   r   c                 ,   ||n| j                   j                  }| j                  |||||||	|
|||      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|s||f|dd z   }||f|z   S |S t        ||||j                  |j                  	      S )
a  
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForQuestionAnswering
        >>> from datasets import load_dataset
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> question = "what's his name?"
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
        >>> start_positions = torch.tensor([1])
        >>> end_positions = torch.tensor([3])

        >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
        >>> loss = outputs.loss
        >>> start_scores = outputs.start_logits
        >>> end_scores = outputs.end_logits
        ```N
r   r   rP   r   r   r   r9  r:  rq   r<   r   r   r0   rl   )ignore_indexr1   )r  start_logits
end_logitsr   r6  )r*   rw  r   r  splitsqueezer   rx  r3   clampr   r   r   r6  )r)   r   r   r   rP   r   r   r  r  r   r9  r:  rq   r<   r   r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                            r-   rA   z&LayoutLMv3ForQuestionAnswering.forward  s   f &1%<k$++B]B]//))%'/!5#% " 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r.   )NNNNNNNNNNNNN)rD   rE   rF   r   r   r   r^   r  r  r  r   r8  r   rA   rH   rI   s   @r-   r  r    sd     156:593715596:48,0/3&*+/37d
E,,-d
 !!2!23d
 !!1!12	d

 u//0d
 E--.d
   1 12d
 "%"2"23d
   0 01d
 $D>d
 'tnd
 d^d
 u''(d
 u//0d
 
u22	3d
 d
r.   r  a
  
    LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for document image classification tasks such as the
    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee	   d
ee	   dee	   deej                     deej                     de
eef   fd       Z xZS )#LayoutLMv3ForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |      | _        t        |d      | _        | j                          y r  )	r   r   r  r*   r   r   r  r  rV  rj   s     r-   r   z,LayoutLMv3ForSequenceClassification.__init__s  sJ      ++)&16vERr.   r   r   r   rP   r   r   r  r   r9  r:  rq   r<   r   c                 6   |
|
n| j                   j                  }
| j                  ||||||||	|
||      }|d   dddddf   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|
s|f|dd z   }||f|z   S |S t        |||j                   |j"                  	      S )
a_  
        bbox (`torch.LongTensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForSequenceClassification
        >>> from datasets import load_dataset
        >>> import torch

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")
        >>> sequence_label = torch.tensor([1])

        >>> outputs = model(**encoding, labels=sequence_label)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr0   r  )r*   rw  r   r  problem_typer  r   r^   r~   r{   r   r  r   r5   r   r   r   r6  )r)   r   r   r   rP   r   r   r  r   r9  r:  rq   r<   r   r  r  r  r  r   s                      r-   rA   z+LayoutLMv3ForSequenceClassification.forward|  s   \ &1%<k$++B]B]//))%'/!5#% " 
 "!*Q1W-1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r.   r  )rD   rE   rF   r   r   r   r^   r  r  r  r   r8  r   rA   rH   rI   s   @r-   r  r  k  sK     156:59371559-1,0/3&*+/37`
E,,-`
 !!2!23`
 !!1!12	`

 u//0`
 E--.`
   1 12`
 ))*`
 $D>`
 'tn`
 d^`
 u''(`
 u//0`
 
u..	/`
 `
r.   r  )r  r  r  r   r   )7rG   r   r   typingr   r   r^   torch.nnr$   torch.nn.functional
functionalr8   torch.utils.checkpointr   r   r   activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_layoutlmv3r   
get_loggerrD   loggerModuler   rK   r   r   r   r   r   r   r   r   r   r  r  r  r  __all__r   r.   r-   <module>r     s       "      A A ! 9  . 6 
 7 
		H	% 		  Fsryy sl . . .2]bii ]B299 ")) :(0 (VO
		 O
fRYY  ryy  ]
/ ]
 ]
@	299 6 e
'@ e
e
P o
%> o
 o
d k
*C k
k
\r.   