
"""PyTorch KOSMOS-2 model."""

import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    CausalLMOutputWithCrossAttentions,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    ModelOutput,
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    logging,
    torch_int,
)
from .configuration_kosmos2 import Kosmos2Config, Kosmos2TextConfig, Kosmos2VisionConfig


logger = logging.get_logger(__name__)


def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
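

# --- Illustrative example (not part of the original module) -------------------------------
# A minimal sketch of what `_expand_mask` produces: padded positions are filled with the
# dtype minimum so they vanish after the softmax. The tensor values are made up for the demo.
def _example_expand_mask():  # pragma: no cover - documentation helper, never called by the model
    attention_mask = torch.tensor([[1, 1, 0]])  # last position is padding
    expanded = _expand_mask(attention_mask, torch.float32)  # shape (1, 1, 3, 3)
    return expanded[0, 0, 0]  # tensor([0., 0., -3.4028e+38])
# -------------------------------------------------------------------------------------------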


def _make_causal_mask(
    input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
):
    """
    Make causal mask used for bi-directional self-attention.
    """
    bsz, tgt_len = input_ids_shape
    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
    mask_cond = torch.arange(mask.size(-1), device=device)
    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)

    if past_key_values_length > 0:
        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to work with both ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx
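

# --- Illustrative example (not part of the original module) -------------------------------
# Sketch of how `create_position_ids_from_input_ids` numbers tokens: padding stays at
# `padding_idx`, real tokens count up from `padding_idx + 1`. The ids below are invented.
def _example_position_ids():  # pragma: no cover - documentation helper, never called by the model
    input_ids = torch.tensor([[5, 6, 7, 1, 1]])  # assume 1 is the padding token id
    return create_position_ids_from_input_ids(input_ids, padding_idx=1)
    # -> tensor([[2, 3, 4, 1, 1]])
# -------------------------------------------------------------------------------------------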


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class Kosmos2ModelOutput(ModelOutput):
    r"""
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    projection_attentions: Optional[tuple[torch.FloatTensor]] = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
@auto_docstring(
    custom_intro="""
    Model output class for `Kosmos2ForConditionalGeneration`.
    """
)
class Kosmos2ForConditionalGenerationModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_embeds: Optional[torch.FloatTensor] = None
    projection_attentions: Optional[tuple[torch.FloatTensor]] = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class Kosmos2VisionEmbeddings(nn.Module):
    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Kosmos2VisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # both `causal_attention_mask` and `attention_mask` may be provided; fold them together
        if self.config._attn_implementation != "flash_attention_2":
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class Kosmos2VisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Kosmos2VisionEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = Kosmos2VisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = Kosmos2VisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class Kosmos2VisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    """

    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Kosmos2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nrs   )r   r   r   )rR   rT   rU   )r|   r   r  use_return_dict	enumerater  r   )rd   inputs_embedsr   r   r   r  r  encoder_statesall_attentionsrT   idxencoder_layerlayer_outputss                r2   r   zKosmos2VisionEncoder.forward  s    N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B)%"3	M *!,M !/=3C2E!E	F  +}.>>N+>Vd
 	
r4   )NNNNN)rm   rn   ro   rp   r    r   r   r   r*   r   r+   r   ri   r   r   r   r   s   @r2   r  r    s    ,2 ,  268<,0/3&*D
 !.D
  (5	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
r4   r  c                        e Zd Zdef fdZ	 	 	 	 	 d
deej                     dee   dee   dedee   de	e
ef   fd	Z xZS )Kosmos2VisionTransformerr|   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r  )r   r   r|   r   r{   r   r   r	  r
  pre_layrnormr  encoderpost_layernorm)rd   r|   r   r   s      r2   r   z!Kosmos2VisionTransformer.__init__  sj    &&	1&9LL8M8MN+F3 ll9&:O:OPr4   r   r   r  r   r  rY   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  |      }| j                  ||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )Nz You have to specify pixel_values)r   )r  r   r  r  r   r   )rR   pooler_outputrT   rU   )r|   r   r  r  r   r   r'  r(  r)  r   rT   rU   )
rd   r   r   r  r   r  rT   encoder_outputsrR   pooled_outputs
             r2   r   z Kosmos2VisionTransformer.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r4   NNNFN)rm   rn   ro   r    r   r   r*   rq   r+   r   ri   r   r   r   r   s   @r2   r%  r%    s    Q2 Q 59,0/3).&*'
u001'
 $D>'
 'tn	'

 #''
 d^'
 
u00	1'
r4   r%  c                       e Zd ZdZddededee   f fdZddededee   fdZeddededee   fd       Z	 e
j                         	 	 	 	 dd	ee
j                     d
ee
j                     dedee
j                     fd       Zd Z xZS )(Kosmos2TextSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.r   embedding_dimrL   c                     t         |           d| _        || _        || _        | j                  || j                  z   ||       y )Nr   )r   r   offsetr1  rL   make_weights)rd   r   r1  rL   r   s       r2   r   z1Kosmos2TextSinusoidalPositionalEmbedding.__init__K  s@    *&-$++5}kRr4   num_embeddingsc                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )Nweightsr:   Fr   )get_embeddinghasattrr(   r7  r"   r6   r   )rd   r5  r1  rL   emb_weightss        r2   r4  z5Kosmos2TextSinusoidalPositionalEmbedding.make_weightsS  s[    ((T4#%..t||/A/A$,,J]J].^KYFr4   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad the last dimension for odd embedding sizes
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values_length: int = 0,
        position_ids: Optional[torch.Tensor] = None,
    ):
        if input_ids is not None:
            bsz, seq_len = input_ids.size()
            if position_ids is None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(
                    input_ids, self.padding_idx, past_key_values_length
                ).to(input_ids.device)
        else:
            bsz, seq_len = inputs_embeds.size()[:-1]
            if position_ids is None:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)

        # expand the embedding table if needed
        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()

    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
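

# --- Illustrative example (not part of the original module) -------------------------------
# Sketch of the sinusoidal table built by `Kosmos2TextSinusoidalPositionalEmbedding`:
# the row at `padding_idx` is zeroed and the table has shape (num_embeddings, embedding_dim).
# The small sizes below are arbitrary and only meant to show the shape.
def _example_sinusoidal_table():  # pragma: no cover - documentation helper, never called by the model
    table = Kosmos2TextSinusoidalPositionalEmbedding.get_embedding(
        num_embeddings=6, embedding_dim=4, padding_idx=1
    )
    return table.shape  # torch.Size([6, 4])
# -------------------------------------------------------------------------------------------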
e	j                  dee	j                     dee   dee	j                     dee	j                     dedee	j                     dee	j                  ee	j                     ee   f   fdZ xZS )KosmosTextAttentionr   r   r   r   
is_decoderadd_inner_attn_layernormr   	layer_idxc	                 j   t         	|           || _        || _        || _        || _        ||z  | _        | j                  |z  | j                  k7  rt        d| j                   d| d      | j                  dz  | _        || _	        || _
        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        d | _        |r't        j$                  ||j&                        | _        y y )Nr   r   r   r   )r   r  )r   r   r|   r   r   r   r   r   r   rS  rU  r   r   r   r   r   r   inner_attn_lnr	  r
  )
rd   r|   r   r   r   rS  rT  r   rU  r   s
            r2   r   zKosmosTextAttention.__init__  s    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTB "#!#iV=R=R!SD $r4   rT   encoder_hidden_statespast_key_valuer   layer_head_maskr   cache_positionrY   c                    |du}	|j                   dd \  }
}| j                  |      }|j                  |
|| j                  | j                        j                  dd      }|St        |t              rA|j                  j                  | j                        }|	r|j                  }n|j                  }n|}|	r|n|}|	rK|IrGj                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }| j#                  |      }|j                  |
d| j                  | j                        j                  dd      }|j                  |
d| j                  | j                        j                  dd      }|D|	s|nd}j%                  ||| j                  d|i      \  }}|	rd|j                  | j                  <   t&        }| j(                  j*                  dk7  rN| j(                  j*                  dk(  r|rt,        j/                  d	       nt0        | j(                  j*                     } || ||||f| j2                  sd
n| j4                  | j6                  d|\  }}|j9                  |
|d      j;                         }| j<                  | j=                  |      }| j?                  |      }||fS )r   Nr   r   r9   r[  Tr   r   r   r   )r   r   ) r   r   r@   r   r   r   
isinstancer   
is_updatedgetrU  cross_attention_cacheself_attention_cacher  rj   r   r   r   updater   r|   r   r   r   r   r   r   r   r   r   rW  r   )rd   rT   rX  rY  r   rZ  r   r[  r   is_cross_attentionr   r   query_statesr^  curr_past_key_valuecurrent_states
key_statesvalue_statesr   r   r   s                        r2   r   zKosmosTextAttention.forward  s    3$>!.!4!4Ra!8
J{{=1#((ZQUQ^Q^_iijkmno%.*=>+66::4>>J
%*8*N*N'*8*M*M'&4#2D.-."<,33DNNCHHJ.55dnnELLL^4J;;~6L#RWaabcefgJ',,ZT^^T]][eefgijkL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn=(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7	%
  $}}C$,,LL	%
 	%
!\ "))*j"EPPR),,[9KmmK0L((r4   )r   FFTN)NNNNFN)rm   rn   ro   rp   rG   r@  r   r+   r   r*   r   r   ri   r   r   r   s   @r2   rR  rR    s4   G %*38#$(#T #T 	#T
 #T TN#T #+4.#T tn#T D>#TP 9=*.1526"'15Q)||Q)  (5Q) !	Q)
 !.Q) "%,,/Q)  Q) !.Q) 
u||Xell3Xe_D	EQ)r4   rR  c                   *     e Zd Zdef fdZd Z xZS )Kosmos2TextFFNr|   c                    t         |           |j                  | _        t        |j                     | _        |j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y r  )r   r   r   r
   activation_functionr   activation_dropoutr   r   r   ffn_dimr   r   r	  r
  ffn_layernormr   s     r2   r   zKosmos2TextFFN.__init__  s    ~~#F$>$>?"(";";99V--v~~>99V^^V-=-=>\\&..f>S>STr4   c                 b   | j                  | j                  |            }t        j                  j	                  || j
                  | j                        }| j                  |      }| j                  |      }t        j                  j	                  || j                  | j                        }|S )Nr   )	r   r   r   r   r   rm  r   ro  r   r  s     r2   r   zKosmos2TextFFN.forward)  s    **488M+BC--mt?V?Vaeanan-o**=9/--mt||VZVcVc-dr4   )rm   rn   ro   r   r   r   r   r   s   @r2   rj  rj    s    
U0 
Ur4   rj  c                       e Zd Zddef fdZ	 	 	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     deej                     d	ee   d
ee	   dee	   deej                     de
ej                  ee
ej                  ej                  f      f   fdZ xZS )Kosmos2TextBlockr|   c           	         t         |           |j                  | _        t        || j                  |j                  |j
                  dd|      | _        |j                  | _        t        j                  | j                  |j                        | _        |j                  ret        || j                  |j                  |j
                  dd|      | _        t        j                  | j                  |j                        | _        t        |      | _        t        j                  | j                  |j                        | _        y )NT)r   r   r   rS  rT  rU  r  F)r   r   r   rR  attention_headsr   r  r   r   r	  r
  self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrj  ffnfinal_layer_norm)rd   r|   rU  r   s      r2   r   zKosmos2TextBlock.__init__4  s    )),nn,,,,%)
 ~~$&LLVEZEZ$[!%% 3.. 0000).#!D ,.<<FLaLa+bD(!&) "T^^AVAV Wr4   rT   r   rX  encoder_attention_maskrZ  cross_attn_layer_head_maskrY  r   	use_cacher[  rY   c                 X   |}| j                  |      } | j                  d||||||
d|\  }}t        j                  j	                  || j                  | j
                        }||z   }d }|t        | d      st        d|  d      |}| j                  |      } | j                  d|||||||
d|\  }}t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  |      }||z   }|f}|r|||fz  }|S )N)rT   rY  r   rZ  r   r[  r   rw  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)rT   rX  r   rZ  rY  r   r[  rs   )ru  r  r   r   r   r   r9  r   rx  rw  rz  ry  )rd   rT   r   rX  r{  rZ  r|  rY  r   r}  r[  r   r  self_attn_weightscross_attn_weightsr  s                   r2   r   zKosmos2TextBlock.forwardS  s    !11-@+94>> ,
'))+/),
 ,
(( --mt||VZVcVc-d =0 " ,40 =dV DD D 
 %H 88GM0A0A0A 	1+&;5 :-"3-	1 	1-M- MM11-4<<Z^ZgZg1hM$}4M !--m< / =0 ")+=>>Gr4   r   )	NNNNNNFTN)rm   rn   ro   r   r   r*   r   r   r   r+   ri   rq   r   r   r   s   @r2   rr  rr  3  s   X0 XD 268<9=26=A*.,1$(15C||C !.C  (5	C
 !) 6C "%,,/C %-U\\$:C !C $D>C D>C !.C 
u  (51B1BEDUDU1U+V"WW	XCr4   rr  c            '           e Zd ZdZdef fdZd Z	 	 	 	 	 ddeej                     deej                     deej                     de
d	eej                     f
d
Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     deej                     deeej                        deej                     d	eej                     dee   dee   dee   dee   deej                     dee   deeef   f$dZ xZS )Kosmos2TextTransformerz
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

    Args:
        config: Kosmos2TextConfig
    r|   c           	         t         |           || _        |j                  | _        |j                  | _        |j
                  rt        j                  |j                        nd| _	        t        j                  |j                  |j                  |j                        | _        t        |j                   |j                  |j                        | _        t        j$                  t'        |j(                        D cg c]  }t+        ||       c}      | _        t        j,                  |j                  |j.                        | _        d| _        y c c}w )Nr%   )rL   )r   r1  rL   )rU  F)r   r   r|   r   	layerdropscale_embeddingr<  sqrtr   embed_scaler   r   
vocab_sizepad_token_idembed_tokensr0  max_position_embeddingsembed_positionsr  r  r  rr  r	  r
  
layer_normr  )rd   r|   ir   s      r2   r   zKosmos2TextTransformer.__init__  s    ~~)):@:P:P499V%5%56VYLL):):F<L<LZ`ZmZmnG 88 **++ 
 mmTYZ`ZgZgTh$iq%5f%J$ij,,v'7'79N9NO&+# %js   =Ec                     d }|d   dkD  r#t        ||j                  |j                  |      }|=t        ||j                  |d         j	                  |j                        }||n||z   }|S )Nr9   r   )r6   r7   r#   )rD   r"   r6   r3   r(   )rd   r   rM  r  r7   combined_attention_maskexpanded_attn_masks          r2   _prepare_decoder_attention_maskz6Kosmos2TextTransformer._prepare_decoder_attention_mask  s     #'r?Q&7##$++'=	'# %!-nm>Q>Q[fgi[j!k!n!n$$" '>&E"K]`wKw $ '&r4   r  rV   img_input_maskr7   r   c                    || j                  |      }|[|j                  |j                        j                  d|j	                  d            ||j                  t
        j                        <   || j                  z  }| j                  ||||      }|j                  |j                        }||z   }t        j                  j                  || j                  | j                        }|S )Nr9   r   )rK   r  r7   r   r   )r  r(   r6   r@   r&   r*   r+   r  r  r   r   r   r   )	rd   rK   r  rV   r  r7   r   	positionsrT   s	            r2   forward_embeddingz(Kosmos2TextTransformer.forward_embedding  s       --i8M#AMQ^QeQeAfAkAkL%%b)BM.++%**+=> &(8(88 (('#9%	 ) 
	 LL!5!56	%	1--mt||VZVcVc-dr4   rK   r   image_embeds_position_maskrX  r{  	head_maskcross_attn_head_maskrS   r}  r   r  r  r[  r   rY   c                 "   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||
t	        d      |"|j
                  }|j                  d|d         }n!|
|
j                         d d }nt	        d      | j                  r%| j                  r|rt        j                  d       d}d}|r<t        |	t              s,t        j                  d       d}t        j                  |	      }	|	|	j!                         nd}|dkD  rd }d }| j#                  ||
||||	      }| j%                  ||||      }||t'        ||
j(                  |d   
      }t*        j,                  j/                  || j.                  | j                        }|rdnd }|rdnd }|r|dnd }t1        ||gddg      D ]j  \  }}|	|j                         d   t3        | j4                        k7  s3t	        d| dt3        | j4                         d|j                         d    d       t7        | j4                        D ]|  \  }}|r||fz  }| j                  r%t9        j:                  g       }|| j<                  k  r? ||||f||||   nd |||   nd |	|||d|}|d   }|sh||d   fz  }|t||d   fz  }~ | j?                  |      }|r|	jA                         }	|r||fz  }tC        ||	|||      S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer9   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.Tr   )rK   r  rV   r  r7   r   r  r   rs   r  r  zThe `z` should be specified for z layers, but it is for .)r{  rZ  r|  rY  r   r}  r[  r   r   )rR   rS   rT   rU   cross_attentions)"r|   r   r  r}  r   r   r@   r&   r  r   r   r   r]  r   r   from_legacy_cacheget_seq_lengthr  r  r3   r"   r   r   r   ziplenr  r  r*   randr  r  to_legacy_cacher   )rd   rK   r   rV   r  rX  r{  r  r  rS   r  r   r}  r   r  r  r[  r   rM  return_legacy_cacher7   rT   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_namer!  decoder_layerdropout_probabilityr#  s                                  r2   r   zKosmos2TextTransformer.forward  s   ( 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	 ]%>cdd"#//K!r;r?;I&',,.s3KTUU&&4==##p "	#Z?\
 #'1CCOTOETE`!?!?!Afg "A%L)-&..'%5#9% / 
 ==K8N

 !,1G1S%12H-J]J]grsugv%w"--mt||VZVcVc-d #7BD0d&7<Q<]rdh %(4H(IKYoKp$q 	 Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03 	 #,DKK"8 	@C#!m%55!}}&+jjn#&7)% (>3<3H3dI]Ii,@,Eos."3#- M *!,M =#3"55(4(]1-=,??(9	@> 6-==?O  -!118+++%1
 	
r4   )NNNr   NNNNNNNNNNNNNNNNN)rm   rn   ro   rp   r   r   r  r   r*   r   rG   r  listrq   r+   r   r   r   ri   r   r   r   r   s   @r2   r  r    s   ,0 ,('4 15/315&'/3!  -! u||,	!
 !.! !$! u||,!J -115/3=A8<9=,07;=A04/3$(,0/3&*15#L
ELL)L
 !.L
 u||,	L

 %-U\\$:L
  (5L
 !) 6L
 ELL)L
 'u||4L
 "$u'8'8"9:L
  -L
 u||,L
 D>L
 $D>L
 'tnL
  d^!L
" !.#L
$ -.%L
& 
u??	@'L
r4   r  c                   P    e Zd ZU eed<   dZddgZdZdZdZ	de
j                  fdZy)Kosmos2PreTrainedModelr|   Tr  rr  r   c                    t        | t              r| j                  j                  }n6t        | t        t
        f      r | j                  j                  j                  }t        | t        t        f      r| j                  j                  }n6t        | t        t
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(self, Kosmos2VisionModel):
            factor = self.config.initializer_factor
        elif isinstance(self, (Kosmos2Model, Kosmos2ForConditionalGeneration)):
            factor = self.config.vision_config.initializer_factor

        if isinstance(self, (Kosmos2TextModel, Kosmos2TextForCausalLM)):
            std = self.config.init_std
        elif isinstance(self, (Kosmos2Model, Kosmos2ForConditionalGeneration)):
            std = self.config.text_config.init_std

        if isinstance(module, Kosmos2VisionEmbeddings):
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, Kosmos2VisionAttention):
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, Kosmos2VisionMLP):
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, KosmosTextAttention):
            nn.init.normal_(module.q_proj.weight, std=std)
            nn.init.normal_(module.k_proj.weight, std=std)
            nn.init.normal_(module.v_proj.weight, std=std)
            nn.init.normal_(module.out_proj.weight, std=std)
        elif isinstance(module, Kosmos2TextFFN):
            nn.init.normal_(module.fc1.weight, std=std)
            nn.init.normal_(module.fc2.weight, std=std)
        elif isinstance(module, Kosmos2TextForCausalLM):
            nn.init.normal_(module.lm_head.weight, std=std)
        elif isinstance(module, Kosmos2ImageToTextProjection):
            nn.init.normal_(module.dense.weight, std=std)
            nn.init.normal_(module.latent_query, mean=0.0, std=std)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class Kosmos2VisionModel(Kosmos2PreTrainedModel):
    config: Kosmos2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Kosmos2VisionConfig):
        super().__init__(config)
        self.model = Kosmos2VisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        return self.model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


class Kosmos2TextModel(Kosmos2PreTrainedModel):
    config: Kosmos2TextConfig

    def __init__(self, config: Kosmos2TextConfig):
        super().__init__(config)
        self.model = Kosmos2TextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embed_tokens
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **kwargs,
        )
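

# NOTE (illustrative sketch, not part of the modeling code): inside `Kosmos2TextTransformer`, positions flagged with
# 1 in `image_embeds_position_mask` receive the projected image features, while positions flagged with 0 keep the
# regular token embeddings. Conceptually (the tensor names below are made up for the example):
#
#     inputs_embeds = embed_tokens(input_ids)                # (batch, seq_len, embed_dim)
#     mask = image_embeds_position_mask.to(torch.bool)       # (batch, seq_len)
#     inputs_embeds[mask] = image_embeds.view(-1, image_embeds.size(-1)).to(inputs_embeds.dtype)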
@auto_docstring(
    custom_intro="""
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    """
)
class Kosmos2TextForCausalLM(Kosmos2PreTrainedModel, GenerationMixin):
    config: Kosmos2TextConfig
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: Kosmos2TextConfig):
        super().__init__(config)

        self.model = Kosmos2TextTransformer(config)
        self.lm_head = nn.Linear(in_features=config.embed_dim, out_features=config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.model.embed_tokens

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if labels is not None:
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )
        lm_logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=lm_logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        image_embeds=None,
        image_embeds_position_mask=None,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        use_cache=None,
        cache_position=None,
        **model_kwargs,
    ):
        # Overwritten -- `image_embeds` and `image_embeds_position_mask` are only needed on the first generation step;
        # afterwards their contribution is already contained in the cached key/value states.
        if cache_position[0] != 0:
            image_embeds = None
            image_embeds_position_mask = None
        elif image_embeds_position_mask is not None:
            # appending `False` to `image_embeds_position_mask` (because `input_ids` grows during generation)
            batch_size, seq_len = inputs_embeds.size()[:-1] if inputs_embeds is not None else input_ids.size()
            mask_len = image_embeds_position_mask.size()[-1]
            image_embeds_position_mask = torch.cat(
                (
                    image_embeds_position_mask,
                    torch.zeros(size=(batch_size, seq_len - mask_len), dtype=torch.bool, device=input_ids.device),
                ),
                dim=1,
            )

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            use_cache=use_cache,
            cache_position=cache_position,
            **model_kwargs,
        )
        # `position_ids` are recomputed (with a padding offset) inside the text model, so drop any created by the
        # parent implementation.
        model_inputs.pop("position_ids", None)

        return model_inputs


class Kosmos2ImageToTextProjection(nn.Module):
    """The layer that transforms the image model's output to part of the text model's input (namely, image features)"""

    def __init__(self, config: Kosmos2Config):
        super().__init__()
        self.dense = nn.Linear(config.vision_config.hidden_size, config.text_config.embed_dim)
        self.latent_query = nn.Parameter(torch.randn(config.latent_query_num, config.text_config.embed_dim))

        self.x_attn = KosmosTextAttention(
            config.text_config,
            config.text_config.embed_dim,
            config.text_config.attention_heads,
            dropout=config.text_config.attention_dropout,
            is_decoder=False,
            add_inner_attn_layernorm=False,
        )

    def forward(self, features):
        hidden_states = self.dense(features)

        # shape = [batch, latent_query_num, h_dim]
        latent_query = self.latent_query.unsqueeze(0).expand(hidden_states.size(0), -1, -1)
        key_value_states = torch.cat([hidden_states, latent_query], dim=1)

        hidden_states, attn_weights = self.x_attn(
            hidden_states=latent_query,
            encoder_hidden_states=key_value_states,
            past_key_values=None,
            attention_mask=None,
            output_attentions=None,
        )

        return hidden_states, attn_weights
@auto_docstring(
    custom_intro="""
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    """
)
class Kosmos2Model(Kosmos2PreTrainedModel):
    config: Kosmos2Config
    main_input_name = "pixel_values"

    def __init__(self, config: Kosmos2Config):
        super().__init__(config)

        self.text_model = Kosmos2TextModel(config.text_config)
        self.vision_model = Kosmos2VisionModel(config.vision_config)
        self.image_to_text_projection = Kosmos2ImageToTextProjection(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.model.embed_tokens

    def set_input_embeddings(self, value):
        self.text_model.model.embed_tokens = value
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        return_attentions: bool = False,
        interpolate_pos_encoding: bool = False,
    ):
        r"""
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            return_attentions (`bool`, *optional*, defaults to `False`):
                Whether to return `projection_attentions` or not.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate positional embeddings or not.
        """
        vision_model_output = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
        image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
        # normalized features
        image_embeds = nn.functional.normalize(image_embeds, dim=-1)
        image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        if return_attentions:
            return image_embeds, projection_attentions
        return image_embeds
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        image_embeds: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple, Kosmos2ModelOutput]:
        r"""
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        vision_model_output = None
        projection_attentions = None
        if image_embeds is None:
            if pixel_values is None:
                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")

            image_embeds, projection_attentions = self.get_image_features(
                pixel_values,
                return_attentions=True,
                interpolate_pos_encoding=interpolate_pos_encoding,
            )

        outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            **kwargs,
        )

        return Kosmos2ModelOutput(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_embeds=image_embeds,
            projection_attentions=projection_attentions,
            vision_model_output=vision_model_output,
        )
@auto_docstring(
    custom_intro="""
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    """
)
class Kosmos2ForConditionalGeneration(Kosmos2PreTrainedModel, GenerationMixin):
    config: Kosmos2Config
    main_input_name = "pixel_values"
    _tied_weights_keys = ["text_model.lm_head.weight"]

    def __init__(self, config: Kosmos2Config):
        super().__init__(config)

        self.text_model = Kosmos2TextForCausalLM(config.text_config)
        self.vision_model = Kosmos2VisionModel(config.vision_config)
        self.image_to_text_projection = Kosmos2ImageToTextProjection(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.model.embed_tokens

    def set_input_embeddings(self, value):
        self.text_model.model.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.text_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.text_model.set_output_embeddings(new_embeddings)
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        image_embeds: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple, Kosmos2ForConditionalGenerationModelOutput]:
        r"""
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features. Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_model_output = None
        projection_attentions = None
        if image_embeds is None:
            if pixel_values is None:
                raise ValueError("You have to specify either `pixel_values` or `image_embeds`.")

            vision_model_output = self.vision_model(pixel_values=pixel_values)
            # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
            # normalized features
            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        lm_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            position_ids=position_ids,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            **kwargs,
        )

        return Kosmos2ForConditionalGenerationModelOutput(
            loss=lm_outputs.loss,
            logits=lm_outputs.logits,
            past_key_values=lm_outputs.past_key_values,
            hidden_states=lm_outputs.hidden_states,
            attentions=lm_outputs.attentions,
            image_embeds=image_embeds,
            projection_attentions=projection_attentions,
            vision_model_output=vision_model_output,
        )

    def generate(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        image_embeds_position_mask: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        image_embeds: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        # in order to allow `inputs` argument (as in `GenerationMixin`)
        inputs = kwargs.pop("inputs", None)
        if pixel_values is not None and inputs is not None:
            raise ValueError(
                f"`inputs`: {inputs} were passed alongside `pixel_values` which is not allowed."
                f"Make sure to either pass `inputs` or pixel_values=..."
            )
        if pixel_values is None and inputs is not None:
            pixel_values = inputs

        if image_embeds is None:
            vision_model_output = self.vision_model(pixel_values)
            # The whole `last_hidden_state` through `post_layernorm` instead of just `pooled_output`.
            image_embeds = self.vision_model.model.post_layernorm(vision_model_output[0])
            # normalized features
            image_embeds = nn.functional.normalize(image_embeds, dim=-1)
            image_embeds, projection_attentions = self.image_to_text_projection(image_embeds)

        output = self.text_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            image_embeds=image_embeds,
            image_embeds_position_mask=image_embeds_position_mask,
            **kwargs,
        )

        return output


__all__ = ["Kosmos2ForConditionalGeneration", "Kosmos2Model", "Kosmos2PreTrainedModel"]
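
# Generation flow sketch (illustrative): `Kosmos2ForConditionalGeneration.generate` encodes `pixel_values` once,
# projects them with `Kosmos2ImageToTextProjection`, and hands the resulting `image_embeds` to the text model's
# `generate`. From the second decoding step on, `prepare_inputs_for_generation` drops the image inputs because their
# contribution is already contained in the cached key/value states. Generated ids can then be turned into grounded
# text with `processor.batch_decode(...)` followed by `processor.post_process_generation(...)`, as shown in the
# `forward` docstring above.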