
    rhk                     `   d dl Zd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*  ed       G d de
jV                               Z,	 dDde
jV                  dejZ                  dejZ                  dejZ                  deejZ                     de.de.fdZ/ G d de
jV                        Z0e# G d  d!e             Z1e e#d"#       G d$ d%e                    Z2 G d& d'e
jV                        Z3 G d( d)e
jV                        Z4 G d* d+e
jV                        Z5e
jl                  e,d,Z7 G d- d.e      Z8 G d/ d0e
jV                        Z9e# G d1 d2e1             Z:e# G d3 d4e             Z; G d5 d6e
jV                        Z<e e#d7#       G d8 d9e                    Z= e#d:#       G d; d<e;             Z>e e#d=#       G d> d?e!                    Z? e#d@#       G dA dBe;e             Z@g dCZAy)E    N)	dataclass)CallableOptionalUnion   )ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple	torch_int   )	AutoModel   )InternVLConfigInternVLVisionConfigRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )InternVLVisionRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zD
        InternVLVisionRMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      /var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/internvl/modeling_internvl.pyr#   zInternVLVisionRMSNorm.__init__.   s1     	ll5::k#:; #    c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr   T)keepdim)	dtypetor&   float32powmeanrsqrtr)   r(   )r*   hidden_statesinput_dtypevariances       r.   forwardzInternVLVisionRMSNorm.forward6   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r/   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler(   shaper)   r*   s    r.   
extra_reprz InternVLVisionRMSNorm.extra_repr=   s*    ))*+6$2G2G1HIIr/   )gư>)__name__
__module____qualname__r#   r<   rA   __classcell__r-   s   @r.   r    r    ,   s    $;Jr/   r    modulequerykeyvalueattention_maskscalingdropoutc                    |}|}	t        j                  ||j                  dd            |z  }
|#|d d d d d d d |j                  d   f   }|
|z   }
t        j
                  j                  |
d      }
t        j
                  j                  |
|| j                        }
t        j                  |
|	      }|j                  dd      j                         }||
fS )Nr   r   r1   dim)ptrainingr   )
r&   matmul	transposer?   r$   
functionalsoftmaxrM   rS   
contiguous)rG   rH   rI   rJ   rK   rL   rM   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r.   eager_attention_forwardr_   A   s     JL<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1 ==((2(>L==((6??([L,,|\:K''1-88:K$$r/   c            
            e Zd ZdZdef fdZ	 	 d	dej                  deej                     deej                     de	e
   fdZ xZS )
InternVLVisionAttentionz+Attention Class for InternVL Vision Encoderconfigc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _
        |j                  }|j                  }d| _        t        j                  | j                  | j                  | j                  z  |j                         | _        t        j                  | j                  | j                  | j                  z  |j                         | _        t        j                  | j                  | j                  | j                  z  |j                         | _        t        j                  | j                  | j                        | _        |dkD  rt        j*                  |      nt        j,                         | _        |rt/        | j                        nt        j,                         | _        |rt/        | j                        | _        y t        j,                         | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fbiasr   )r"   r#   rb   r+   	embed_dimnum_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr$   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentityr    q_normk_norm)r*   rb   proj_dropoutqk_normr-   s       r.   r#   z InternVLVisionAttention.__init___   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta?F+DNN;BKKM?F+DNN;BKKMr/   r9   rK   output_attentionsrY   c                    |j                         \  }}}| j                  |      }| j                  |      }	| j                  |      }
| j	                  |      }| j                  |	      }	|j                  ||| j                  | j                        j                  dd      }|	j                  ||| j                  | j                        j                  dd      }	|
j                  ||| j                  | j                        j                  dd      }
t        }| j                  j                  dk7  rt        | j                  j                     } || ||	|
|f| j                  sdn| j                   | j"                  dd|\  }}|j                  ||| j$                        }| j'                  |      }| j)                  |      }|r||f}|S |d f}|S )Nr   r   eager        F)rM   rL   ro   )sizerr   rs   rt   rx   ry   reshaperh   ri   rU   viewr_   rb   _attn_implementationr   rS   rl   rk   rf   ru   rm   )r*   r9   rK   r|   rY   
batch_sizeseq_len_query_statesrZ   r[   attention_interfacer^   r\   outputoutputss                   r.   r<   zInternVLVisionAttention.forward{   s    "/!3!3!5
GQ{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HJJ
%
 
%
!\ "))*gt~~N&&{3((0,=6<( EKD>r/   NN)rB   rC   rD   __doc__r   r#   r&   Tensorr   r   r   r<   rE   rF   s   @r.   ra   ra   \   sd    5Z3 Z> 2648	)||) !.) $ELL1	)
 -.)r/   ra   c                   L     e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZ fdZ xZS )InternVLVisionPreTrainedModelrb   internvl_visionpixel_valuesTInternVLVisionLayerc                 V   t         |   |       t        |t              r|j                  j
                  j                          |j                  $|j                  j
                  j                          |j                  %|j                  j
                  j                          yyt        |t              rs|j                  j
                  j                  | j                  j                         |j                  j
                  j                  | j                  j                         yy)zInitialize the weightsN)r"   _init_weights
isinstanceInternVLVisionEmbeddings	cls_tokendatazero_
mask_tokenposition_embeddingsr   lambda_1fill_rb   layer_scale_init_valuelambda_2)r*   rG   r-   s     r.   r   z+InternVLVisionPreTrainedModel._init_weights   s    f%f67!!'')  ,!!&&,,.))5**//557 6 34OO  &&t{{'I'IJOO  &&t{{'I'IJ 5r/   )rB   rC   rD   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   rE   rF   s   @r.   r   r      sF      )$O&*#./N"&K Kr/   r   z7
    Class for outputs of [`InternVLVisionModel`].
    )custom_introc                       e Zd ZdZy)$InternVLVisionModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)rB   rC   rD   r    r/   r.   r   r      s    r/   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                 ^   t         |           |j                  |j                  }}|j                  |j
                  }}|d   |d   z  |d   |d   z  z  }|d   |d   z  |d   |d   z  f}|| _        || _        || _        || _        || _        t        j                  ||||      | _
        y )Nr   r   )kernel_sizestride)r"   r#   
image_size
patch_sizenum_channelsr+   num_patchespatch_shaper$   Conv2d
projection)	r*   rb   r   r   r   r+   r   r   r-   s	           r.   r#   z&InternVLVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79K9Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L+:^hir/   r   returnc                    |j                   \  }}}}|| j                  k7  rt        d      | j                  |      }|j                   d   |j                   d   }}|j	                  d      j                  dd      }|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   r   )r?   r   rj   r   flattenrU   )	r*   r   r   r   heightwidth
embeddingspatch_heightpatch_widths	            r.   r<   z%InternVLVisionPatchEmbeddings.forward   s    2>2D2D/
L&%4,,,w  __\2
$.$4$4Q$79I9I!9Lk''*44Q:
L+666r/   )	rB   rC   rD   r   r#   r&   r   r<   rE   rF   s   @r.   r   r      s)    j7ELL 7U\\ 7r/   r   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z		 dd
ej                  de
ej                     dej                  fdZ xZS )r   zc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    rb   r   Nc                 2   t         |           t        j                  t	        j
                  dd|j                              | _        |j                  r:t        j                  t	        j
                  dd|j                              | _	        nd | _	        t        |      | _        |j                  | _        t        |j                  t        j                   j"                        r|j                  n|j                  |j                  f| _        | j                  j$                  }|j&                  r=t        j                  t	        j
                  d|dz   |j                              | _        nd | _        t        j*                  |j,                        | _        y )Nr   )r"   r#   r$   r%   r&   zerosr+   r   use_mask_tokenr   r   patch_embeddingsr   r   r   collectionsabcIterabler    use_absolute_position_embeddingsr   rv   hidden_dropout_probrM   )r*   rb   r   r-   s      r.   r#   z!InternVLVisionEmbeddings.__init__   s$   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r/   r   r   r   c                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  d   z  }	|| j
                  d   z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr1   r         ?r   r   bicubicF)r   modealign_cornersrP   )r?   r   r&   jit
is_tracingr   r   r   permuter$   rV   interpolater   cat)r*   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedrQ   
new_height	new_widthsqrt_num_positionss               r.   interpolate_pos_encodingz1InternVLVisionEmbeddings.interpolate_pos_encoding  sj    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"tq11
T__Q//	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr/   r   bool_masked_posc                    |j                   \  }}}}| j                  |      \  }\  }}|j                         \  }	}
}|K| j                  j	                  |	|
d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  j	                  |	dd      }t        j                  ||fd      }| j                  || j                  |||      z   }| j                  |      }|||ffS )Nr1   r   rP   )r?   r   r   r   expand	unsqueezetype_asr   r&   r   r   r   rM   )r*   r   r   r   r   r   r   r   r   r   r   mask_tokensw
cls_tokenss                 r.   r<   z InternVLVisionEmbeddings.forward:  s   
 +001fe262G2G2U/
/\;!+!2
GQ&//00WbIK))"-55kBA#q1u-a?J^^**:r2>
YY
J7Q?
##/#d&C&CJPVX]&^^J\\*-
L+666r/   N)rB   rC   rD   r   r   r#   r&   r   intr   r   
BoolTensorr<   rE   rF   s   @r.   r   r      s    
>3 > >,&D5<< &D &DUX &D]b]i]i &DV 7;7ll7 "%"2"237 
	7r/   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )InternVLVisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )r"   r#   rb   r   
hidden_actactivation_fnr$   rp   r+   intermediate_sizefc1fc2r*   rb   r-   s     r.   r#   zInternVLVisionMLP.__init__U  sd    #F$5$5699V//1I1IJ99V55v7I7IJr/   r9   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r*   r9   s     r.   r<   zInternVLVisionMLP.forward\  s4    /**=9/r/   )rB   rC   rD   r#   r&   r   r<   rE   rF   s   @r.   r   r   T  s$    KU\\ ell r/   r   )
layer_normrms_normc                        e Zd ZdZdeddf fdZ	 d	dej                  dede	e
ej                     e
ej                  ej                  f   f   fdZ xZS )
r   z?This corresponds to the Block class in the timm implementation.rb   r   Nc                    t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |j                     |j                  |j                        | _        t        |j                     |j                  |j                        | _        |j                  }t        j                   |t#        j$                  |j                        z  d      | _        t        j                   |t#        j$                  |j                        z  d      | _        t        j*                  |j,                        | _        y )Nr   r,   T)requires_grad)r"   r#   chunk_size_feed_forwardseq_len_dimra   	attentionr   mlpNORM2FN	norm_typer+   layer_norm_epslayernorm_beforelayernorm_afterr   r$   r%   r&   r'   r   r   rv   r   rM   )r*   rb   init_valuesr-   s      r.   r#   zInternVLVisionLayer.__init__i  s    '-'E'E$08$V, '(8(8 9&:L:LRXRgRg h&v'7'789K9KQWQfQfg33[5::f>P>P3Q%Qaef[5::f>P>P3Q%Qaefzz&"<"<=r/   r9   r|   c                 "   | j                  | j                  |      |      \  }}| j                  |z  }||z   }| j                  |      }| j	                  |      }| j                  |      }| j                  | j                  |z  }||z   }||fS )N)r|   )r   r   r   r   r   rM   r   )r*   r9   r|   attention_outputattention_weightslayer_outputs         r.   r<   zInternVLVisionLayer.forwardx  s    
 /3nn!!-0/ /= /
++
  ==+;; )=8 ++M:xx-||L1==$==<7L $m3...r/   )F)rB   rC   rD   r   r   r#   r&   r   boolr   r>   r<   rE   rF   s   @r.   r   r   f  si    I>3 > >$ #(/||/  / 
uU\\"E%,,*D$EE	F	/r/   r   c                   n     e Zd Zdeddf fdZe	 	 d	dej                  dedede	e
ef   fd       Z xZS )
InternVLVisionEncoderrb   r   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r"   r#   rb   r$   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r*   rb   ir-   s      r.   r#   zInternVLVisionEncoder.__init__  sO    ]]vOgOgIh#iA$7$?#ij
&+# $js   A#r9   r|   output_hidden_statesc                     |rdnd }|rdnd }t        | j                        D ]'  \  }}|r||fz   } |||      }|d   }|s||d   fz   }) |r||fz   }t        |||      S )Nr   r   r   last_hidden_stater9   
attentions)	enumerater  r   )	r*   r9   r|   r  all_hidden_statesall_self_attentionsr  layer_modulelayer_outputss	            r.   r<   zInternVLVisionEncoder.forward  s     #7BD$5b4(4 		POA|#$58H$H!(8IJM)!,M &9]1=M<O&O#		P   1]4D D++*
 	
r/   )FF)rB   rC   rD   r   r#   r   r&   r   r  r   r>   r   r<   rE   rF   s   @r.   r  r    sg    ,3 , ,  #(%*	
||
  
 #	

 
uo%	&
 
r/   r  c                        e Zd Zdeddf fdZd Zee	 	 	 ddej                  de
ej                     de
e   d	e
e   deeef   f
d
              Z xZS )InternVLVisionModelrb   r   Nc                 2   t         |   |       || _        t        |      | _        t        |      | _        |j                  rt        j                         n*t        j                  |j                  |j                        | _        | j                          y )Nr   )r"   r#   rb   r   r   r  encoderuse_mean_poolingr$   rw   	LayerNormr+   r   	layernorm	post_initr   s     r.   r#   zInternVLVisionModel.__init__  so     26:,V4 $44BKKM",,vGYGY_e_t_t:u 	
 	r/   c                 .    | j                   j                  S r   )r   r   r@   s    r.   get_input_embeddingsz(InternVLVisionModel.get_input_embeddings  s    ///r/   r   r   r|   r  c                 .   ||n| j                   j                  }||n| j                   j                  }| j                  ||      \  }}| j	                  |||      }|d   }| j                  |      }t        ||j                  |j                        S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        )r   )r|   r  r   r  )	rb   r|   r  r   r  r  r   r9   r  )	r*   r   r   r|   r  embedding_outputr   encoder_outputssequence_outputs	            r.   r<   zInternVLVisionModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 #oolOo\!,,/!5 ' 

 *!,..93-)77&11
 	
r/   )NNN)rB   rC   rD   r   r#   r"  r   r   r&   r   r   r   r  r   r>   r   r<   rE   rF   s   @r.   r  r    s    3  0  7;,0/3
ll
 "%"2"23
 $D>	

 'tn
 
u::	;
  
r/   r  c                   8    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZy)InternVLPreTrainedModelrb    Tpast_key_valuesN)rB   rC   rD   r   r   r   r   _skip_keys_device_placementr   r   _can_compile_fullgraphr   r   r   r/   r.   r(  r(    s7    &*#"3N!"&r/   r(  c                   *     e Zd Zdef fdZd Z xZS )InternVLMultiModalProjectorrb   c                 *   t         |           t        j                  |j                  j
                  t        d|j                  z        dz  z        | _        t        j                  |j                  j
                  t        d|j                  z        dz  z  |j                  j
                        | _        t        |j                     | _        t        j                  |j                  j
                  |j                  j
                        | _        y )Nr   r   )r"   r#   r$   r  vision_configr+   r   downsample_ratior   rp   text_configlinear_1r   projector_hidden_actactlinear_2r   s     r.   r#   z$InternVLMultiModalProjector.__init__  s    ,,v';';'G'G#aRXRiRiNiJjnoJo'op		  ,,s1v7N7N3N/OST/TTV\VhVhVtVt
 &556		&"4"4"@"@&BTBTB`B`ar/   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r3  r5  r6  )r*   image_featuresr9   s      r.   r<   z#InternVLMultiModalProjector.forward  s@    7m4/m4r/   )rB   rC   rD   r   r#   r<   rE   rF   s   @r.   r.  r.    s    b~ br/   r.  zM
    Base class for InternVL outputs, with hidden states and attentions.
    c                   :    e Zd ZU dZdZeej                     ed<   y)InternVLModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	rB   rC   rD   r   r;  r   r&   FloatTensorr   r   r/   r.   r:  r:    s    
 8<%"3"34;r/   r:  zx
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    c            #       R    e Zd ZddiZdef fdZd Zd Zd Zd Z		 	 dd	e
j                  d
eeeee   f      dee   fdZde
j$                  de
j                  de
j                  fdZee	 	 	 	 	 	 	 	 	 	 	 	 	 d de
j$                  d	e
j                  dee
j,                     dee
j$                     dee   dee
j                     d
eeeee   f      dee   dee   dee   dee   dee   dee
j$                     dee   deeef   fd              Zd!de
j,                  defdZ xZ S )"InternVLModelzlanguage_model.modellanguage_modelrb   c                     t         |   |       t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _	        | j                          y r   )r"   r#   r   from_configr0  vision_towerr.  multi_modal_projectorr2  r?  r   r   s     r.   r#   zInternVLModel.__init__1  sY     %11&2F2FG%@%H"'33F4F4FGr/   c                 6    | j                   j                         S r   )r?  r"  r@   s    r.   r"  z"InternVLModel.get_input_embeddings9  s    ""7799r/   c                 :    | j                   j                  |       y r   )r?  set_input_embeddingsr*   rJ   s     r.   rF  z"InternVLModel.set_input_embeddings<  s    007r/   c                     || _         y r   r?  r*   decoders     r.   set_decoderzInternVLModel.set_decoder?  s
    %r/   c                     | j                   S r   rI  r@   s    r.   get_decoderzInternVLModel.get_decoderB  s    """r/   r   vision_feature_layervision_feature_select_strategyc                 J   ||n| j                   j                  }||n| j                   j                  }| j                   j                  }|dk(  r| j	                  |      j
                  }n| j                  |      j                  |   }|dk(  r|ddddddf   }|j                  d   }t        |dz        }|j                  d   }	|j                  |	||d      }| j                  ||      }|j                  |	d|j                  d         }| j                  |      }|S )	a%  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`int` or `list[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        Nr1   )r   defaultr   r   r   )scale_factor)rb   rO  rP  r1  rB  r  vision_modelr9   r?   r   r   pixel_shufflerC  )
r*   r   rO  rP  rY   r1  vision_featureschannelsfeature_sizer   s
             r.   get_image_featuresz InternVLModel.get_image_featuresE  sD   & %9$D $++JjJj 	
 .9 +;; 	'  ;;772%"//\/J\\O"//\/JXXYmnO)Y6-aQh7O #((+8S=)$**1-
 *11*lLZ\] ,,_K[,\ *11*b/BWBWXZB[\ 44_Er/   	input_idsinputs_embedsr8  c                 P   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|j                  d   |j                  d   z  }||   j                         |j                         k7  rt        d| d|       |S )z
        Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )r3   devicer1   r   r   z6Image features and image tokens do not match: tokens: z, features )r"  r&   tensorrb   image_token_idlongr]  allsumr   	expand_asr4   r?   numelrj   )r*   rZ  r[  r8  special_image_maskn_image_tokensn_image_featuress          r.   get_placeholder_maskz"InternVLModel.get_placeholder_maskz  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL+,2248L8L8NNHHXXcdtcuv  "!r/   rK   position_idsr*  	use_cacher|   r  return_dictcache_positionrY   r   c                    |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j
                  }|d u |d uz  rt        d      | | j                         |      }|`| j                  |||      }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d|||||	|
|d|d	|}t        |j                   |j"                  |j$                  |j&                  |      S d       S )Nz:You must specify exactly one of input_ids or inputs_embedsr   rO  rP  )r[  r8  T)	rK   ri  r*  r[  rj  r|   r  rk  rl  )r  r*  r9   r  r;  r   )rb   r|   r  use_return_dictrO  rP  rj   r"  rY  r4   r]  r3   rh  masked_scatterr?  r:  r  r*  r9   r  )r*   rZ  r   rK   ri  r*  r[  rO  rP  rj  r|   r  rk  rl  rY   r8  re  r   s                     r.   r<   zInternVLModel.forward  s   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	
 .9 +;; 	' -t";<YZZ 7D557	BM#!44)%9/M 5 N
 ,..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M%$%% 
)%+'/!5)
 
 +%77#33!//))2>2J
 	

 QU
 	
r/   rV  rS  c           
         |j                         \  }}}}||z  dk7  s||z  dk7  rt        d      |j                  ||t        ||z        t        ||z              }|j	                  dddd      j                         }|j                  |t        ||z        t        ||z        t        ||dz  z              }|j	                  dddd      j                         }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )r   rj   r   r   r   rX   )r*   rV  rS  r   r   r   rW  s          r.   rU  zInternVLModel.pixel_shuffle  s     />.B.B.D+
E68L A%)=)Bjkk *..s6L#893x,?V;W
 *11!Q1=HHJ *..F\12C8L4MsS[_kmn_nSoOp

 *11!Q1=HHJr/   r   )NNNNNNNNNNNNN)r   )!rB   rC   rD   _checkpoint_conversion_mappingr   r#   r"  rF  rL  rN  r&   r<  r   r   r   liststrrY  
LongTensorrh  r   r   r   r	   r  r   r   r>   r:  r<   floatrU  rE   rF   s   @r.   r>  r>  )  s    '=>N%O"~ :8&# AE8<	3''3 'uS$s)^'<=3 )1	3j"))":?:K:K"]b]n]n"0  '+*.1537+/59@D8<$(,0/3&*59D
##D
 ''D
 !.	D

 u//0D
 "%D
   1 12D
 'uS$s)^'<=D
 )1D
 D>D
 $D>D
 'tnD
 d^D
 !!1!12D
 -.D
  
u11	2!D
  D
L!U\\ ! !r/   r>  zT
    Base class for InternVL causal language model (or autoregressive) outputs.
    c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	InternVLCausalLMOutputWithPasta]  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitsr*  r9   r  r;  )rB   rC   rD   r   ry  r   r&   r<  r   rz  r*  rs  r9   r>   r  r;  r   r/   r.   rx  rx    s      )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r/   rx  zV
    The INTERNVL model which consists of a vision backbone and a language model.
    c            )           e Zd ZdddddZdgZdef fdZd	 Zd
 Zde	j                  fdZd Zd Z	 	 d&dej                  deeeee   f      dee   fdZed        Zed        Zed        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d'dej8                  dej                  deej:                     deej8                     dee   deej                     deeeee   f      dee   deej8                     dee   dee   dee   dee   d eej8                     d!eeej:                  f   d"eej:                     d#e e!   dee"e#f   f$d$              Z$	 	 	 	 	 	 d( fd%	Z% xZ&S )) InternVLForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightrb   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFrd   )r"   r#   r>  modelr$   rp   r2  r+   
vocab_sizer}  r   r   s     r.   r#   z)InternVLForConditionalGeneration.__init__+  sS     "6*
yy!3!3!?!?ASASA^A^ejkr/   c                 6    | j                   j                         S r   )r  r"  r@   s    r.   r"  z5InternVLForConditionalGeneration.get_input_embeddings1  s    zz..00r/   c                 :    | j                   j                  |       y r   )r  rF  rG  s     r.   rF  z5InternVLForConditionalGeneration.set_input_embeddings4  s    

''.r/   r   c                     | j                   S r   )r}  r@   s    r.   get_output_embeddingsz6InternVLForConditionalGeneration.get_output_embeddings7  s    ||r/   c                 :    | j                   j                  |       y r   )r  rL  rJ  s     r.   rL  z,InternVLForConditionalGeneration.set_decoder:  s    

w'r/   c                 6    | j                   j                         S r   )r  rN  r@   s    r.   rN  z,InternVLForConditionalGeneration.get_decoder=  s    zz%%''r/   r   rO  rP  c                 B     | j                   j                  d|||d|S )Nrn  r   )r  rY  )r*   r   rO  rP  rY   s        r.   rY  z3InternVLForConditionalGeneration.get_image_features@  s5     -tzz,, 
%!5+I
 	
 	
r/   c                 .    | j                   j                  S r   )r  r?  r@   s    r.   r?  z/InternVLForConditionalGeneration.language_modelO  s    zz(((r/   c                 .    | j                   j                  S r   )r  rB  r@   s    r.   rB  z-InternVLForConditionalGeneration.vision_towerS  s    zz&&&r/   c                 .    | j                   j                  S r   )r  rC  r@   s    r.   rC  z6InternVLForConditionalGeneration.multi_modal_projectorW  s    zz///r/   rZ  rK   ri  r*  r[  labelsrj  r|   r  rk  rl  logits_to_keepimage_sizesrY   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j
                  } | j                  d|||||||||
||d||d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	4 | j                  d||	| j                   j                  j                  d|}t        |||j                  |j                   |j"                  |j$                        S )ai  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", torch_dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```NT)rZ  r   rK   ri  r*  r[  rO  rP  rj  r|   r  rk  rl  r  r   )rz  r  r  )ry  rz  r*  r9   r  r;  r   )rb   r|   r  ro  rO  rP  r  r   r   slicer}  loss_functionr2  r  rx  r*  r9   r  r;  )r*   rZ  r   rK   ri  r*  r[  rO  rP  r  rj  r|   r  rk  rl  r  r  rY   r   r9   slice_indicesrz  ry  s                          r.   r<   z(InternVLForConditionalGeneration.forward[  s   r 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	
 .9 +;; 	' $** 
%)%+'!5+I/!5)#
 
$  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r/   c           	      N    t        
|   |f|||||d|}	|d   dk(  r||	d<   |	S )N)r*  r[  rK   rl  r  r   r   )r"   prepare_inputs_for_generation)r*   rZ  r*  r[  r   rK   rl  r  rY   model_inputsr-   s             r.   r  z>InternVLForConditionalGeneration.prepare_inputs_for_generation  sV     w<
+')))
 
 !! ,8L(r/   r   )NNNNNNNNNNNNNNr   N)NNNNNN)'rB   rC   rD   rr  _tied_weights_keysr   r#   r"  rF  r$   Moduler  rL  rN  r&   r<  r   r   r   rs  rt  rY  propertyr?  rB  rC  r   r   ru  r   r	   r  r   r   r>   rx  r<   r  rE   rF   s   @r.   r|  r|    s    "8-"?#,	&" ++~ 1/ryy (( AE8<	
''
 'uS$s)^'<=
 )1	
 ) ) ' ' 0 0  '+*.1537+/59@D8<-1$(,0/3&*5934.2#i
##i
 ''i
 !.	i

 u//0i
 "%i
   1 12i
 'uS$s)^'<=i
 )1i
 ))*i
 D>i
 $D>i
 'tni
 d^i
 !!1!12i
  c5<</0!i
" ell+#i
$ +,%i
& 
u44	5'i
  i
\  r/   r|  )r   r  r(  r>  r|  )r   )Bcollections.abcr   dataclassesr   typingr   r   r   r&   torch.nnr$   activationsr   cache_utilsr	   
generationr
   integrationsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   autor   configuration_internvlr   r   r  r    r   rv  r_   ra   r   r   r   r   r   r  r   r   r  r  r(  r.  r:  r>  rx  r|  __all__r   r/   r.   <module>r     s  .  ! , ,   !   ) 7 B 9 d d F & a a  H Y'JBII J (J6 %II%<<% 
% <<	%
 U\\*% % %6Hbii HV KO K K2 
+E  !7BII !7L[7ryy [7|		  3H
I-/4 -/`#
BII #
L 2
7 2
 2
j 'o ' '")) $ 
<"9 < <  
M+ M
M` 
<[ < <2 
B'> B
BJr/   