
"""PyTorch GIT model."""

import math
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    BaseModelOutputWithPooling,
    CausalLMOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_git import GitConfig, GitVisionConfig


logger = logging.get_logger(__name__)
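
# Rough module map (orientation comment): images are encoded by a CLIP-style vision tower and projected into
# the text embedding space; the projected image tokens are concatenated with the text token embeddings and
# decoded by a causal text transformer:
#
#     pixel_values -> GitVisionModel -> GitProjection -> image tokens
#     input_ids    -> GitEmbeddings                   -> text tokens
#     [image tokens ; text tokens] -> GitEncoder -> GitForCausalLM language-modeling head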

@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class GitVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


class GitEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
    ) -> torch.Tensor:
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        if inputs_embeds is None:
            embeddings = self.word_embeddings(input_ids)
        else:
            embeddings = inputs_embeds

        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class GitSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )
        self.layer_idx = layer_idx
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.image_patch_tokens = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1)
        if config.num_image_with_embedding is not None:
            self.image_patch_tokens *= config.num_image_with_embedding

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        cutoff = self.image_patch_tokens if pixel_values_present else 0
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        if past_key_value is not None:
            # only cache the text part of the key/value states; the image part is recomputed every forward pass
            key_layer_past, value_layer_past = past_key_value.update(
                key_layer[:, :, cutoff:, :], value_layer[:, :, cutoff:, :], self.layer_idx
            )
            key_layer = torch.cat([key_layer[:, :, :cutoff, :], key_layer_past], dim=2)
            value_layer = torch.cat([value_layer[:, :, :cutoff, :], value_layer_past], dim=2)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if past_key_value is not None:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
                    -1, 1
                )
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in GitModel forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        return context_layer, attention_probs


class GitSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


GIT_SELF_ATTENTION_CLASSES = {
    "eager": GitSelfAttention,
}


class GitAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config, position_embedding_type=position_embedding_type, layer_idx=layer_idx
        )
        self.output = GitSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        attn_output, self_attn_weights = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            past_key_value,
            output_attentions,
            pixel_values_present,
        )
        attention_output = self.output(attn_output, hidden_states)
        return attention_output, self_attn_weights


class GitIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class GitOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class GitLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = GitAttention(config, layer_idx=layer_idx)
        self.intermediate = GitIntermediate(config)
        self.output = GitOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        attention_output, self_attention_weights = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            pixel_values_present=pixel_values_present,
        )

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )

        return layer_output, self_attention_weights

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class GitEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([GitLayer(config, i) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        pixel_values_present: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                past_key_values,
                output_attentions,
                pixel_values_present,
            )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions]
                if v is not None
            )
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class GitPreTrainedModel(PreTrainedModel):
    config: GitConfig
    base_model_prefix = "git"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, GitVisionEmbeddings):
            nn.init.normal_(module.class_embedding, mean=0.0, std=self.config.initializer_range)
            nn.init.normal_(module.patch_embedding.weight, std=self.config.initializer_range)
            nn.init.normal_(module.position_embedding.weight, std=self.config.initializer_range)
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class GitVisionEmbeddings(nn.Module):
    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).reshape(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class GitVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class GitVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        if self.config._attn_implementation != "flash_attention_2":
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class GitVisionEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = GitVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = GitVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class GitVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`GitVisionEncoderLayer`].

    Args:
        config: GitVisionConfig
    """

    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class GitVisionTransformer(nn.Module):
    def __init__(self, config: GitVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = GitVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = GitVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]

        last_hidden_state = self.post_layernorm(last_hidden_state)

        if not return_dict:
            return (last_hidden_state,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=last_hidden_state,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The vision model from CLIP, used in GIT, without any head or projection on top.
    """
)
class GitVisionModel(GitPreTrainedModel):
    config: GitVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: GitVisionConfig):
        super().__init__(config)
        self.vision_model = GitVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GitVisionModel

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = GitVisionModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


class GitProjection(nn.Module):
    def __init__(self, config: GitConfig):
        super().__init__()
        self.config = config
        self.visual_projection = nn.Sequential(
            nn.Linear(config.vision_config.hidden_size, config.hidden_size),
            nn.LayerNorm(config.hidden_size, eps=config.vision_config.layer_norm_eps),
        )

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        return self.visual_projection(embeddings)


@auto_docstring(
    custom_intro="""
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    """
)
class GitModel(GitPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = GitEmbeddings(config)
        self.image_encoder = GitVisionModel(config.vision_config)
        self.encoder = GitEncoder(config)

        self.visual_projection = GitProjection(config)

        if config.num_image_with_embedding is not None:
            self.img_temperal_embedding = nn.ParameterList(
                nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size))
                for _ in range(config.num_image_with_embedding)
            )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def _generate_future_mask(self, size: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
        # Default mask is for forward direction. Flip for backward direction.
        mask = torch.triu(torch.ones(size, size, device=device, dtype=dtype), diagonal=1)
        mask = mask.masked_fill(mask == 1, float("-inf"))
        return mask

    def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None):
        num_tgt = tgt.shape[1]
        num_memory = memory.shape[1]
        device = tgt.device
        dtype = tgt.dtype
        top_left = torch.zeros((num_memory, num_memory), device=device, dtype=dtype)
        top_right = torch.full(
            (num_memory, num_tgt + past_key_values_length),
            float("-inf"),
            device=tgt.device,
            dtype=dtype,
        )
        bottom_left = torch.zeros(
            (num_tgt, num_memory),
            dtype=dtype,
            device=tgt_mask.device,
        )

        if past_key_values_length > 0:
            tgt_mask = torch.zeros(
                (tgt_mask.shape[0], tgt_mask.shape[0] + past_key_values_length),
                dtype=dtype,
                device=tgt_mask.device,
            )

        left = torch.cat((top_left, bottom_left), dim=0)
        right = torch.cat((top_right, tgt_mask.to(dtype)), dim=0)

        full_attention_mask = torch.cat((left, right), dim=1)[None, :]

        if memory_key_padding_mask is None:
            memory_key_padding_mask = torch.full((memory.shape[0], memory.shape[1]), fill_value=False, device=device)
        # if it is False, it means valid. That is, it is not a padding
        if memory_key_padding_mask.dtype != torch.bool:
            raise ValueError("Memory key padding mask must be a boolean tensor.")
        zero_negative_infinity = torch.zeros_like(memory_key_padding_mask, dtype=tgt.dtype)
        zero_negative_infinity[memory_key_padding_mask] = float("-inf")
        full_attention_mask = full_attention_mask.expand(
            (memory_key_padding_mask.shape[0], num_memory + num_tgt, num_memory + past_key_values_length + num_tgt)
        )
        full_attention_mask = full_attention_mask.clone()
        origin_left = full_attention_mask[:, :, :num_memory]
        update = zero_negative_infinity[:, None, :]
        full_attention_mask[:, :, :num_memory] = origin_left + update

        # add axis for multi-head
        full_attention_mask = full_attention_mask[:, None, :, :]

        return full_attention_mask

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = AutoModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = "this is an image of two cats"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        seq_length = input_shape[1]

        # past_key_values_length
        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = (
                past_key_values.get_seq_length()
                if isinstance(past_key_values, Cache)
                else past_key_values[0][0].shape[2]
            )

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        projected_visual_features = None
        if pixel_values is not None:
            if pixel_values.ndim == 4:
                # here we assume pixel_values is of shape (batch_size, num_channels, height, width)
                visual_features = self.image_encoder(
                    pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
                ).last_hidden_state

            elif pixel_values.ndim == 5:
                # here we assume pixel_values is of shape (batch_size, num_frames, num_channels, height, width)
                visual_features = []
                for frame_idx in range(pixel_values.shape[1]):
                    visual_features_frame = self.image_encoder(
                        pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding
                    ).last_hidden_state
                    visual_features_frame += self.img_temperal_embedding[frame_idx]
                    visual_features.append(visual_features_frame)

                # finally, concatenate all features along sequence dimension
                visual_features = torch.cat(visual_features, dim=1)

            else:
                raise ValueError("pixel_values must be of rank 4 or 5")

            projected_visual_features = self.visual_projection(visual_features)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        if projected_visual_features is None:
            projected_visual_features = torch.zeros(
                (embedding_output.shape[0], 0, embedding_output.shape[2]),
                dtype=embedding_output.dtype,
                device=embedding_output.device,
            )

        # Repeat visual features to match embedding batch size.
        projected_visual_features = projected_visual_features.repeat(
            embedding_output.size(0) // projected_visual_features.size(0), 1, 1
        )

        # concatenate patch token and text token embeddings
        hidden_states = torch.cat((projected_visual_features, embedding_output), dim=1)

        # By default, an additive causal mask is created
        # for masking the future (to prevent the model from peeking into the future)
        tgt_mask = self._generate_future_mask(seq_length, embedding_output.dtype, embedding_output.device)

        # Create an attention mask of shape (batch_size, 1, tgt_seq_len, src_seq_len)
        combined_attention_mask = self.create_attention_mask(
            tgt=embedding_output,
            memory=projected_visual_features,
            tgt_mask=tgt_mask,
            past_key_values_length=past_key_values_length,
        )

        if attention_mask is not None:
            # if the user provides an attention mask, we add it to the default one
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            expanded_attn_mask = _prepare_4d_attention_mask(
                attention_mask, embedding_output.dtype, tgt_len=input_shape[-1]
            ).to(embedding_output.device)
            if past_key_values_length > 0:
                expanded_attn_mask = expanded_attn_mask[:, :, -past_key_values_length:, :]
            else:
                combined_attention_mask[:, :, -input_shape[1] :, -input_shape[1] :] += expanded_attn_mask

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=combined_attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            pixel_values_present=pixel_values is not None,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutputWithPast(
            last_hidden_state=sequence_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    """
)
class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["output.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.git = GitModel(config)
        self.output = nn.Linear(config.hidden_size, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.output

    def set_output_embeddings(self, new_embeddings):
        self.output = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Examples:

        Image captioning example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
        >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_caption)
        two cats sleeping on a pink blanket next to remotes.
        ```

        Visual question answering (VQA) example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> from huggingface_hub import hf_hub_download
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

        >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
        >>> image = Image.open(file_path).convert("RGB")

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> question = "what does the front of the bus say at the top?"

        >>> input_ids = processor(text=question, add_special_tokens=False).input_ids
        >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
        >>> input_ids = torch.tensor(input_ids).unsqueeze(0)

        >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
        ['what does the front of the bus say at the top? special']
        ```

        Video captioning example:

        ```python
        >>> import av
        >>> import numpy as np
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download
        >>> from transformers import AutoProcessor, AutoModelForCausalLM

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

        >>> # set seed for reproducibility
        >>> np.random.seed(45)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # load video
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample frames
        >>> num_frames = model.config.num_image_with_embedding
        >>> indices = sample_frame_indices(
        ...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
        ... )
        >>> frames = read_video_pyav(container, indices)

        >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

        >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
        Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        outputs = self.git(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            pixel_values=pixel_values,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        logits = self.output(sequence_output)

        loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            num_image_tokens = self.git.encoder.layer[0].attention.self.image_patch_tokens
            shifted_logits = logits[:, num_image_tokens:-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss = self.loss_function(
                shifted_logits.view(-1, self.config.vocab_size),
                labels.view(-1),
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
    ):
        # cut decoder_input_ids if past_key_values is used
        if past_key_values is not None:
            past_length = past_key_values.get_seq_length()

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        input_shape = input_ids.shape
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "pixel_values": kwargs.get("pixel_values", None),
            "past_key_values": past_key_values,
            "use_cache": use_cache,
        }


__all__ = ["GitForCausalLM", "GitModel", "GitPreTrainedModel", "GitVisionModel"]