
    rh                        d dl mZ d dlmZmZmZ d dlZd dlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZmZ dd
lmZmZmZmZ ddlmZ  ej2                  e      Ze ed       G d de                    Ze ed       G d de                    Z G d dej<                        Z G d dej<                        Z 	 dJdej<                  dejB                  dejB                  dejB                  deejB                     de"de"fdZ#d  Z$ G d! d"ej<                        Z%dKd#ejB                  d$e"d%e&d&ejB                  fd'Z' G d( d)ej<                        Z( G d* d+ej<                        Z) G d, d-e      Z* G d. d/ej<                        Z+d0ejB                  d1e,ejB                     d&ejB                  fd2Z- G d3 d4ej<                        Z. G d5 d6ej<                        Z/ G d7 d8ej<                        Z0 G d9 d:ej<                        Z1 G d; d<e      Z2 G d= d>e      Z3 G d? d@ej<                        Z4e G dA dBe             Z5dC Z6e G dD dEe5             Z7 edF       G dG dHe5             Z8g dIZ9y)L    )	dataclass)CallableOptionalUnionN)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringcan_return_tuplelogging   )VJEPA2ConfigzO
    VJEPA Predictor outputs that also contains the masked encoder outputs
    )custom_introc                       e Zd ZU dZej
                  ed<   dZeej
                     ed<   dZ	ee
ej
                  df      ed<   dZee
ej
                  df      ed<   dZeej
                     ed<   y)	$VJEPA2WithMaskedInputPredictorOutputa  
    masked_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `context_mask` is provided which is applied on VJEPA2Encoder outputs):
        The masked hidden state of the model.
    target_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `target_mask` is provided which is applied on VJEPA2Encoder outputs):
        The target hidden state of the model.
    last_hidden_stateNmasked_hidden_state.hidden_states
attentionstarget_hidden_state)__name__
__module____qualname____doc__torchFloatTensor__annotations__r   r   r   tupler   r        }/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/vjepa2/modeling_vjepa2.pyr   r       s     (((7;%"3"34;=AM8E%"3"3S"89:A:>Ju00#567>7;%"3"34;r&   r   zs
    VJEPA outputs that also contains the masked encoder outputs
    Optionally contains the predictor outputs
    c                        e Zd ZU dZej
                  ed<   dZeej
                     ed<   dZ	ee
ej
                  df      ed<   dZee
ej
                  df      ed<   dZee   ed<    fd	Z xZS )
 VJEPA2WithMaskedInputModelOutputaq  
    masked_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `context_mask` is provided which is applied on VJEPA2Encoder outputs):
        The masked hidden state of the model.
    predictor_output (`VJEPA2WithMaskedInputPredictorOutput`, *optional*):
        The output from the Predictor module.
    r   Nr   .r   r   predictor_outputc                     t        t        | 	               }t        |d   t              r|d   j                         |d<   t        |      S )N)listsuperto_tuple
isinstancer   r$   )selfoutput	__class__s     r'   r/   z)VJEPA2WithMaskedInputModelOutput.to_tupleJ   sD    eg&()fRj"FG,,.F2JV}r&   )r   r   r   r    r!   r"   r#   r   r   r   r$   r   r*   r   r/   __classcell__r3   s   @r'   r)   r)   5   s     (((7;%"3"34;=AM8E%"3"3S"89:A:>Ju00#567>GKhCDK r&   r)   c                   x     e Zd ZdZ	 d	dedef fdZed        Zde	j                  de	j                  fdZ xZS )
VJEPA2PatchEmbeddings3Dz"
    Image to Patch Embedding
    confighidden_sizec                 H   t         |           |j                  | _        |j                  | _        || _        t        j                  |j                  ||j                  |j                  |j                  f|j                  |j                  |j                  f      | _        y )N)in_channelsout_channelskernel_sizestride)	r.   __init__
patch_sizetubelet_sizer9   r   Conv3din_chansprojr1   r8   r9   r3   s      r'   r?   z VJEPA2PatchEmbeddings3D.__init__V   s    
 	 ++"//&II$,,f.?.?ARARS''):):F<M<MN	
	r&   c                     | j                   | j                  z  | j                  | j                  z  z  | j                  | j                  z  z  S Nframes_per_cliprA   	crop_sizer@   r8   s    r'   num_patchesz#VJEPA2PatchEmbeddings3D.num_patchesg   sO     ##v':'::6#4#4466#4#446	
r&   pixel_values_videosreturnc                 f    | j                  |      j                  d      j                  dd      }|S )N   r   )rD   flatten	transpose)r1   rM   xs      r'   forwardzVJEPA2PatchEmbeddings3D.forwardo   s.    II)*2215??1Er&      )r   r   r   r    r   intr?   staticmethodrL   r!   TensorrT   r4   r5   s   @r'   r7   r7   Q   sS      

 
" 
 
5<< ELL r&   r7   c                   f     e Zd ZdZddedef fdZdej                  dej                  fdZ	 xZ
S )	VJEPA2Embeddings>
    Construct mask token, position and patch embeddings.
    r8   r9   c                     t         |           || _        || _        t	        ||      | _        | j
                  j                  | _        |j                  | _        y )Nr9   )r.   r?   r8   r9   r7   patch_embeddingsrL   r@   rE   s      r'   r?   zVJEPA2Embeddings.__init__y   sM    & 7K X00<< ++r&   rM   rN   c                 l   |j                   d   }|j                  ddddd      }|| j                  j                  k  r)|j	                  dd| j                  j                  dd      }| j
                  j                  j                  j                  }|j                  |      }| j                  |      }|S )Nr   r   rP   r      )dtype)
shapepermuter8   rA   repeatr_   rD   weightrb   to)r1   rM   
num_framestarget_dtype
embeddingss        r'   rT   zVJEPA2Embeddings.forward   s    (..q1
 299!Q1aH 000"5"<"<Q4;;C[C[]^`a"b,,1188>>144<4H**+>?
r&   rU   )r   r   r   r    r   rW   r?   r!   rY   rT   r4   r5   s   @r'   r[   r[   t   s6    ,| ,# ,5<< ELL r&   r[   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }|||z  }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr,   )dimrb   )ptrainingr   rP   )r!   matmulrR   r   
functionalsoftmaxfloat32rg   rb   rq   rv   
contiguous)
rk   rl   rm   rn   ro   rp   rq   kwargsattn_weightsattn_outputs
             r'   eager_attention_forwardr      s     <<s}}R'<=GL ==((2U]](SVVW\WbWbcL ==((6??([L !#n4,,|U3K''1-88:K$$r&   c                 X   | j                         \  }}}}t        j                  |dz  | j                  | j                        }||dz  z  }dd|z  z  }|j                  d      |z  }|j                         }|j                         }	|j                  d      j                  dddd      }|	j                  d      j                  dddd      }	| j                  dd      }
|
j                  d	      \  }}t        j                  | |fd	      }
|
j                  d
      }
| |	z  |
|z  z   S )NrP   rb   deviceg       @      ?i'  r,   r   )r,   rP   rt   rs   )sizer!   arangerb   r   	unsqueezesincossqueezere   	unflattenunbindstackrQ   )rS   posB	num_headsNDomegafreqemb_sinemb_cosyy1y2s                r'   rotate_queries_or_keysr      s   Ay!Q
 LLaqwwqxx@E	QWE%,E==u$D hhjGhhjGoob!((Aq!4Goob!((Aq!4G 	
B AXX"XFBbS"I2&A			"AKAK((r&   c                        e Zd Z	 	 ddededef fdZd Zd ZddZd Z		 	 	 dd	e
ej                     d
ede
ej                     deeej                  ej                  f   eej                     f   fdZ xZS )VJEPA2RopeAttentionr8   r9   num_attention_headsc                 z   t         |           || _        || _        || _        ||z  dk7  rt        d|f d| d      t        ||z        | _        | j                  | j                  z  | _        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  ||      | _        |j                   | _        t        j$                  | j"                        | _        | j                  j(                  | j                  j*                  z  | _        | j                  j.                  | j                  j0                  z  | _        t        d| j                  dz  dz  z        | _        t        d| j                  dz  dz  z        | _        t        d| j                  dz  dz  z        | _        | j                  dz  | _        d	| _        y )
Nr   zThe hidden size z4 is not a multiple of the number of attention heads .biasrP   r         F)r.   r?   r8   r9   r   
ValueErrorrW   attention_head_sizeall_head_sizer   Linearqkv_biasrl   rm   rn   rD   attention_probs_dropout_probdropout_probDropoutrq   rJ   r@   	grid_sizerI   rA   
grid_depthd_dimh_dimw_dimrp   	is_causal)r1   r8   r9   r   r3   s       r'   r?   zVJEPA2RopeAttention.__init__   s    	&#6 ,,1"K>"2 3,-Q0 
 $'{5H'H#I !558P8PPYY{D,>,>V__U
99[$*<*<6??SYY{D,>,>V__U
IIk;7	"??zz$"3"34..$++2H2HH++559Q9QQt771<BCD
t771<BCD
t771<BCD
//5r&   c                 P    t        | j                  | j                  z        }||z  S rG   )rW   r   )r1   idstokens_per_frames      r'   _get_frame_posz"VJEPA2RopeAttention._get_frame_pos   s&    t~~>?&&&r&   c                     t        | j                  | j                  z        }| j                  |      }|||z  z
  }| j                  }||z  S rG   )rW   r   r   )r1   r   r   	frame_idstokens_per_rows        r'   _get_height_posz#VJEPA2RopeAttention._get_height_pos   sN    t~~>?'',	$y00n$$r&   c                    |j                   }|j                  d      }|-|j                  d      j                  d| j                  d      }nt        j                  ||      }t        | j                  | j                  z        }| j                  |      }| j                  }| j                  |      }	|||z  z
  ||	z  z
  }
||	|
fS )Nr   r   )r   r   r   re   r   r!   r   rW   r   r   r   )r1   rS   masksr   
token_sizer   r   r   r   
height_ids	width_idss              r'   get_position_idsz$VJEPA2RopeAttention.get_position_ids  s    VVAY
 //!$++At/G/GKC,,z&9Ct~~>?'',	))#.
 +i77>J;VV	*i//r&   c                    |\  }}}d}t        |d||| j                  z   f   |      }|| j                  z  }t        |d||| j                  z   f   |      }|| j                  z  }t        |d||| j                  z   f   |      }	|| j                  z  }|| j                  k  r&|d|d f   }
t        j                  |||	|
gd      }|S t        j                  |||	gd      }|S )Nr   .)r   r,   r   )r   r   r   r   r   r!   cat)r1   qkpos_idsd_maskh_maskw_masksqkdqkhqkwqkrs              r'   apply_rotary_embeddingsz+VJEPA2RopeAttention.apply_rotary_embeddings  s    !($RQTZZ-?(?%@fM	TZZ$RQTZZ-?(?%@fM	TZZ$RQTZZ-?(?%@fM	TZZt'''S!"W+CCc3/R8B 	 Cc?3B	r&   position_maskoutput_attentions	head_maskrN   c           
      .   |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  |      j                  |d| j                  | j                        j                  dd      }
| j                  ||      }| j                  |	|      }	| j                  ||      }t        }| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt        | j                  j                     } || ||	|
|| j                   | j"                  | j$                  sdn| j&                  	      \  }}|j)                         d d
 | j*                  fz   }| j-                  |j/                  |            }|r||f}|S |f}|S )Nr,   r   rP   )r   eagersdpa`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        r   rp   rq   rs   )rc   rl   viewr   r   rR   rm   rn   r   r   r   r8   _attn_implementationloggerwarning_oncer   r   rp   rv   r   r   r   rD   reshape)r1   r   r   r   r   
batch_size
seq_length_query_layer	key_layervalue_layerr   attention_interfacecontext_layerattention_probsnew_context_layer_shapeoutputss                    r'   rT   zVJEPA2RopeAttention.forward)  s    %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 '']'K00GD	22;H(?;;++w6{{//69>O##L
 '>dkk>^>^&_#)<nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S		-"7"78O"PQ6G=/2 O\M]r&   )rV      rG   )NFN)r   r   r   r   rW   r?   r   r   r   r   r   r!   rY   boolr   r$   rT   r4   r5   s   @r'   r   r      s      #%	## # !	#J'%0*( 15"',06  -6  	6
 ELL)6 
uU\\5<</0%2EE	F6r&   r   input	drop_probrv   rN   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   r   r   )rc   ndimr!   randrb   r   floor_div)r   r   rv   	keep_probrc   random_tensorr2   s          r'   	drop_pathr   c  s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr&   c                   t     e Zd ZdZddee   f fdZdej                  dej                  fdZ	de
fdZ xZS )	VJEPA2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).r   c                 0    t         |           || _        y rG   )r.   r?   r   )r1   r   r3   s     r'   r?   zVJEPA2DropPath.__init__{  s    "r&   r   rN   c                 D    t        || j                  | j                        S rG   )r   r   rv   )r1   r   s     r'   rT   zVJEPA2DropPath.forward  s    FFr&   c                      d| j                    S )Nzp=)r   r1   s    r'   
extra_reprzVJEPA2DropPath.extra_repr  s    DNN#$$r&   rG   )r   r   r   r    r   floatr?   r!   rY   rT   strr   r4   r5   s   @r'   r   r   x  s@    b#(5/ #GU\\ Gell G%C %r&   r   c                   f     e Zd Zddededef fdZdej                  dej                  fdZ	 xZ
S )		VJEPA2MLPr8   r9   	mlp_ratioc                     t         |           |x}}t        ||z        }t        j                  ||d      | _        t        |j                     | _        t        j                  ||d      | _	        y NTr   )
r.   r?   rW   r   r   fc1r	   
hidden_act
activationfc2)r1   r8   r9   r   in_featuresout_featureshidden_featuresr3   s          r'   r?   zVJEPA2MLP.__init__  sa    %00lkI5699[/E !2!2399_lFr&   hidden_staterN   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rG   )r  r  r  )r1   r  s     r'   rT   zVJEPA2MLP.forward  s2    xx-|4xx-r&   )rV         @)r   r   r   r   rW   r   r?   r!   rY   rT   r4   r5   s   @r'   r   r     s=    G| G# GQV GELL U\\ r&   r   c                        e Zd ZdZ	 	 	 	 ddededededef
 fdZ	 	 	 ddej                  d	e
ej                     d
e
ej                     dedeej                  df   f
dZ xZS )VJEPA2LayerzCThis corresponds to the Block class in the original implementation.r8   drop_path_rater9   r   r   c                    t         |           || _        || _        || _        || _        t        j                  ||j                        | _	        t        |||      | _        |j                  dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t#        |||      | _        y )Nepsr   )r9   r   )r.   r?   r8   r9   r   r   r   	LayerNormlayer_norm_epsnorm1r   	attentionr  r   Identityr   norm2r   mlp)r1   r8   r  r9   r   r   r3   s         r'   r?   zVJEPA2Layer.__init__  s     	&#6 "\\+63H3HI
,V[BUV;A;P;PSV;V7\^\g\g\i\\+63H3HI
V	Rr&   r   r   r   r   rN   .c                    |}| j                  |      }| j                  ||||      }|d   }| j                  |      |z   }|}| j                  |      }| j	                  |      }| j                  |      |z   }|dd  }|f|z   }|S )N)r   r   r   r   r   )r  r  r   r  r  )	r1   r   r   r   r   residualself_attention_outputsattention_outputr   s	            r'   rT   zVJEPA2Layer.forward  s     !

=1!%'/	 "0 "
 2!4'788C !

=1/}5@ ), "W,r&   )r   rV   r   r
  )NNF)r   r   r   r    r   r   rW   r?   r!   rY   r   r   r$   rT   r4   r5   s   @r'   r  r    s    M
 !$#%SS S 	S
 !S S. 15,0"'||  - ELL)	
   
u||S 	!r&   r  c                        e Zd Zdef fdZe	 	 	 	 d	deej                     deej                     de	de	de
f
d       Z xZS )
VJEPA2Encoderr8   c                 ^   t         |           || _        t        ||j                        | _        t        |j                        D cg c]2  }|j                  dkD  r|j                  |z  |j                  dz
  z  nd4 }}t        j                  t        |j                        D cg c]3  }t        |||   |j                  |j                  |j                        5 c}      | _        t        j                  |j                  |j                         | _        d| _        y c c}w c c}w )Nr^   r   r   r  r9   r   r   r  F)r.   r?   r8   r[   r9   rj   rangenum_hidden_layersr  r   
ModuleListr  r   r   layerr  r  	layernormgradient_checkpointingr1   r8   idrop_path_ratesr3   s       r'   r?   zVJEPA2Encoder.__init__  s   *6v?Q?QR 6334
 LRKcKcfgKgV""Q&&*B*BQ*FGmpp
 
 ]] v778	  #21#5 & 2 2(.(B(B$..	

 f&8&8f>S>ST&+##

	s   
7D%)8D*rM   r   r   output_hidden_statesrN   c                    |rdnd }|rdnd }| j                  |      }t        | j                        D ]2  \  }	}
|r||fz   }|||	   nd } |
|d ||      }|d   }|s*||d   fz   }4 | j                  |      }|r||fz   }t	        |||      S )Nr%   r   r   r   r   r   )rj   	enumerater#  r$  r   )r1   rM   r   r   r)  r|   all_hidden_statesall_self_attentionsr   r'  layer_modulelayer_head_masklayer_outputss                r'   rT   zVJEPA2Encoder.forward  s     #7BD$5b4(;<(4 		POA|#$58H$H!.7.CilO(oO`aM)!,M &9]1=M<O&O#		P }5 1]4D D++*
 	
r&   )NNFF)r   r   r   r   r?   r   r   r!   rY   r   r   rT   r4   r5   s   @r'   r  r    ss    ,| ,0  7;,0"'%*!
%ell3!
 ELL)!
  	!

 #!
 
!
 !
r&   r  tensorr   c                    g }|D ]j  }|j                  | j                        }|j                  d      j                  dd| j	                  d            }|t        j                  | d|      gz  }l t        j                  |d      S )z
    Args:
        tensor (`torch.Tensor`):
            Tensor of shape [batch_size, num_patches, feature_dim]
        masks (`List[torch.Tensor]`):
            List of tensors of shape [batch_size, num_patches] containing indices of patches to keep
    r,   r   rt   indexr   r   )rg   r   r   re   r   r!   gatherr   )r2  r   all_masked_tensorsmask	mask_keeps        r'   apply_masksr:    s      Mwwv}}%NN2&--aFKKOD	u||FKLLM
 99'Q//r&   c                        e Zd ZdZdef fdZed        Z	 ddej                  de
ej                     de
ej                     ded	eej                  ej                  f   f
d
Z xZS )VJEPA2PredictorEmbeddingsr\   r8   c                    t         |           || _        t        j                  |j
                  |j                        | _        d| _        |j                  | _
        |j                  | _        t        j                  t        j                  | j                  dd|j                              | _        |j                   | _        || _        y )Nr   r   )r.   r?   r8   r   r   r9   pred_hidden_sizepredictor_embeddingsnum_mask_tokenspred_zero_init_mask_tokenszero_init_mask_tokenspred_num_mask_tokens	Parameterr!   zerosmask_tokensr@   r1   r8   r3   s     r'   r?   z"VJEPA2PredictorEmbeddings.__init__!  s    $&IIf.@.@&BYBY$Z! %+%F%F"%::<<D4H4H!QPVPgPg(hi ++r&   c                     | j                   dkD  rM| j                   | j                  z  | j                  | j                  z  z  | j                  | j                  z  z  S | j                  | j                  z  | j                  | j                  z  z  S Nr   rH   rK   s    r'   rL   z%VJEPA2PredictorEmbeddings.num_patches.  s    !!A%''6+>+>>##v'8'88:##v'8'88: $$(9(99f>N>NRXRcRc>cddr&   r   context_masktarget_mask
mask_indexrN   c                    |j                  d      }| j                  |      }|| j                  z  }| j                  |   }|d   j	                         dz   }|j                  ||d      }t        ||      }|j                  t        |      dd      }t        j                  ||gd      }	t        j                  |d      }
t        j                  |d      }t        j                  |
|gd      }|	|fS )z
        hidden_states : encoder outputs (context)
        context_mask: tokens of the context (outputs from the encoder)
        target_mask: tokens to predict
        mask_index: index of the target mask to choose (useful for multiclip?)
        r   r   r   )
r   r?  r@  rF  maxre   r:  lenr!   r   )r1   r   rJ  rK  rL  r   contexttargetmax_patch_numrj   cmtmr   s                r'   rT   z!VJEPA2PredictorEmbeddings.forward9  s     q!++M:  $"6"66
!!*- $A**,q0q-3V[1 ..\!2Aq9YY0a8
 YY|+YY{*		2r(*5  r&   r   )r   r   r   r    r   r?   rX   rL   r!   rY   r-   rW   r$   rT   r4   r5   s   @r'   r<  r<    s    |  e e &!||&! 5<<(&! %,,'	&!
 &! 
u||U\\)	*&!r&   r<  c                        e Zd Zdef fdZddZd Ze	 	 	 ddej                  de
ej                     de
ej                     deej                     d	ed
edefd       Z xZS )VJEPA2Predictorr8   c                    t         |           || _        d| _        t	        |      | _        t        |j                        D cg c]2  }|j                  dkD  r|j                  |z  |j                  dz
  z  nd4 }}t        j                  t        |j                        D cg c]3  }t        |||   |j                  |j                  |j                        5 c}      | _        t        j                   |j                  |j"                        | _        t        j&                  |j                  |j(                  d      | _        y c c}w c c}w )NFr   r   r  r  Tr   )r.   r?   r8   r%  r<  rj   r   pred_num_hidden_layersr  r   r"  r  r>  pred_num_attention_headspred_mlp_ratior#  r  r  r$  r   r9   rD   r&  s       r'   r?   zVJEPA2Predictor.__init__c  s5   &+#3F; 6889
  0014 %%)V-J-JQ-NO
 
 ]] v<<=	  #21#5 & 7 7(.(G(G$33	

 f&=&=6CXCXYIIf55v7I7IPTU	+
	s   7E
$8Ec           	         |j                  |j                        }t        j                  |d|      }|j                  |j                        }|j	                  d      j                  dd|j                  d            }t        j                  |d|      }|k|d   e|j                  |j                        }|j                  ddddd      }|j	                  d      j	                  d      j                  d|j                  d      |j                  d      d      j	                  d      j                  dddd|j                  d            }t        j                  |d|      }|j	                  d      j	                  d      j	                  d      j                  d|j                  d      |j                  d      |j                  d      d      }t        j                  |d|      }|j                  ddddd      }|||fS )Nr   r4  r,   r   rP   r   ra   )rg   r   r!   r6  r   expandr   rd   )r1   r   position_masksargsortr   hidden_states_argsort
argsort_4d
argsort_5ds           r'   sort_tokenszVJEPA2Predictor.sort_tokens  s   **^223n!7K **]112 ' 1 1" 5 < <R]EWEWXZE[ \]AVW  Yq\%=jj!1!12G!))!Q1a8I!!!$1INN1-y~~a/@"E2BB	r(:;  YAZHI!!!$11INN1-y~~a/@)..QRBSUWX	  YAZHI!))!Q1a8Ini77r&   c                     |j                  |j                        }t        j                  |d      }|j	                  d      j                  dd|j                  d            }t        j                  |d|      }|S )Nr   r   r,   r4  )rg   r   r!   r^  r   r\  r   r6  )r1   r   r^  reverse_argsorts       r'   unsort_tokenszVJEPA2Predictor.unsort_tokens  si    **]112--Q7)33B7>>r2}GYGYZ\G]^]Qr&   encoder_hidden_statesrJ  rK  r   r   r)  rN   c                    |rdnd }|rdnd }	t        ||      }|j                  \  }
}}| j                  |||      \  }}t        j                  |d      }| j                  ||||      \  }}}t        | j                        D ]2  \  }}|r||fz   }|||   nd } |||||      }|d   }|s*|	|d   fz   }	4 |r||fz   }| j                  |      }| j                  ||      }|d d |d f   }| j                  |      }t        |||	      S )Nr%   r   r   r   r+  )r:  rc   rj   r!   r^  rb  r,  r#  r$  re  rD   r   )r1   rf  rJ  rK  r   r   r)  r|   r-  r.  r   N_ctxtr   r   r]  r^  r'  r/  r0  r1  s                       r'   rT   zVJEPA2Predictor.forward  s[    #7BD$5b4 !,,A< P,2261(,8M|]h(i%~ --A6373C3CMSacjlu3v0~y(4 		POA|#$58H$H!.7.CilO(YjkM)!,M &9]1=M<O&O#		P   1]4D D}5**='B%aj1		-0++*
 	
r&   rG   )NFF)r   r   r   r   r?   rb  re  r   r!   rY   r-   r   r   r   rT   r4   r5   s   @r'   rV  rV  b  s    V| V88B  -1"'%*0
$||0
 5<<(0
 %,,'	0

 ELL)0
  0
 #0
 
0
 0
r&   rV  c                        e Zd ZdZdef fdZ	 	 d	dej                  deej                     dee	   de
ej                  eej                     f   fdZ xZS )
VJEPA2PoolerSelfAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr8   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   F)r.   r?   r8   r9   	embed_dimr   r   head_dimr   scaleattention_dropoutrq   r   r   r   k_projv_projq_projout_projrG  s     r'   r?   z"VJEPA2PoolerSelfAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar&   r   ro   r   rN   c           
         |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        }
| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt        | j                  j                     }
 |
| |||	|| j                  | j                  | j                   sdn| j"                        \  }}|j%                  |||      j'                         }| j)                  |      }|sd}||fS 	z#Input shape: Batch x Time x Channelr   rP   r   r   r   r   r   N)rc   rs  rq  rr  r   r   rn  rR   r   r8   r   r   r   r   r   ro  rv   rq   r   r{   rt  )r1   r   ro   r   r   r   rm  querieskeysvaluesr   r~   r}   s                r'   rT   z!VJEPA2PoolerSelfAttention.forward  s    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r&   NFr   r   r   r    r   r?   r!   rY   r   r   r$   rT   r4   r5   s   @r'   rj  rj    so    GB| B. 26,1	-)||-) !.-) $D>	-)
 
u||Xell33	4-)r&   rj  c                        e Zd ZdZdef fdZ	 	 ddej                  dej                  dej                  deej                     dee	   d	e
ej                  eej                     f   fd
Z xZS )VJEPA2PoolerCrossAttentionz_It's different from other cross-attention layers, doesn't have output projection layer (o_proj)r8   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y rl  )r.   r?   r8   r9   rm  r   r   rn  r   ro  rp  rq   r   r   r   rq  rr  rs  rG  s     r'   r?   z#VJEPA2PoolerCrossAttention.__init__'  s    ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?r&   rw  rx  ry  ro   r   rN   c           
         |j                   \  }}}|j                   d   }	| j                  |      }| j                  |      }| j                  |      }|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||	| j
                  | j                        j                  dd      }|j	                  ||	| j
                  | j                        j                  dd      }t        }
| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt        | j                  j                     }
 |
| ||||| j                  | j                  | j                   sdn| j"                        \  }}|j%                  |||      j'                         }|sd}||fS rv  )rc   rs  rq  rr  r   r   rn  rR   r   r8   r   r   r   r   r   ro  rv   rq   r   r{   )r1   rw  rx  ry  ro   r   r   q_seq_lengthrm  kv_seq_lengthr   r~   r}   s                r'   rT   z"VJEPA2PoolerCrossAttention.forward:  s    /6mm+
L)

1++g&{{4 V$,,z<Waabcefgyy]DNNDMMR\\]^`abZV``abdef(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
!\ "))*lINYY[ LL((r&   rz  r{  r5   s   @r'   r}  r}  "  s    i@| @0 26,1/)/) ll/) 	/)
 !./) $D>/) 
u||Xell33	4/)r&   r}  c                        e Zd Zdef fdZ	 d	dej                  dej                  dee   de	ej                  df   fdZ
 xZS )
VJEPA2PoolerSelfAttentionLayerr8   c                 :   t         |           t        j                  |j                  |j
                        | _        t        |      | _        t        j                  |j                  |j
                        | _	        t        ||j                        | _        y Nr  r^   )r.   r?   r   r  r9   r  layer_norm1rj  	self_attnlayer_norm2r   r  rG  s     r'   r?   z'VJEPA2PoolerSelfAttentionLayer.__init__n  sl    <<(:(:@U@UV26:<<(:(:@U@UVV1C1CDr&   r   ro   r   rN   .c                     |}| j                  |      }| j                  |||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   ro   r   )r  r  r  r  )r1   r   ro   r   r  r}   r   s          r'   rT   z&VJEPA2PoolerSelfAttentionLayer.forwardu  s      !((7&*nn')/ '5 '
#|
 !=0 ((7/ =0 "&Gr&   )Fr   r   r   r   r?   r!   rY   r   r   r$   rT   r4   r5   s   @r'   r  r  m  s[    E| E -2	#||# # $D>	#
 
u||S 	!#r&   r  c                        e Zd Zdef fdZ	 	 d
dej                  dej                  deej                     dede	ej                  df   f
d	Z
 xZS )VJEPA2PoolerCrossAttentionLayerr8   c                 :   t         |           t        j                  |j                  |j
                        | _        t        |      | _        t        j                  |j                  |j
                        | _	        t        ||j                        | _        y r  )r.   r?   r   r  r9   r  r  r}  
cross_attnr  r   r  rG  s     r'   r?   z(VJEPA2PoolerCrossAttentionLayer.__init__  sl    <<(:(:@U@UV4V<<<(:(:@U@UVV1C1CDr&   rw  r  ro   r   rN   .c                     |}| j                  |      }| j                  |||||      ^}}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r|t	        |      z  }|S )N)ro   r   )r  r  r  r  r$   )r1   rw  r  ro   r   r  r}   r   s           r'   rT   z'VJEPA2PoolerCrossAttentionLayer.forward  s     ''5&*oo)/ '6 '
#|  ,.  ''5xx-,./u\**Gr&   rz  r  r5   s   @r'   r  r    sm    E| E 26"' ll !.	
   
u||S 	!r&   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )VJEPA2AttentivePoolerzAttentive Poolerr8   c                 F   t         |           t        j                  t	        j
                  dd|j                              | _        t        |      | _	        t        j                  t        |j                        D cg c]  }t        |       c}      | _        y c c}w rI  )r.   r?   r   rD  r!   rE  r9   query_tokensr  cross_attention_layerr"  r   num_pooler_layersr  self_attention_layers)r1   r8   r   r3   s      r'   r?   zVJEPA2AttentivePooler.__init__  sr    LLQ6;M;M)NO%DV%L"%']]=B6C[C[=\]+F3]&
"]s   ?Br  rN   c                     | j                   D ]  } ||d       d   } | j                  j                  |j                  d   dd      }| j	                  ||      d   }|j                  d      S )N)ro   r   r   )r  r  re   rc   r  r   )r1   r  r#  rw  s       r'   rT   zVJEPA2AttentivePooler.forward  sw    // 	GE dCAFL	G##**<+=+=a+@!QG11'<HK##A&&r&   )
r   r   r   r    r   r?   r!   rY   rT   r4   r5   s   @r'   r  r    s-    
| 
'ELL 'U\\ 'r&   r  c                   :    e Zd ZU eed<   dZdZdZg dZdZ	dZ
d Zy)VJEPA2PreTrainedModelr8   vjepa2rM   T)r  r  r  r<  c                 ~   | j                   j                  }d }t        |t              r ||j                  |       t        |j                  d      D ]]  \  }}||dz  z  } ||j                  j                  j                  |        ||j                  j                  j                  |       _ |t        |j                        dz   dz  z  } ||j                  j                  j                  j                  |       yt        |t              rF|j                  r%|j                   j"                  j%                          y ||j                   |       yt        |t&        j(                  t&        j*                  t&        j,                  f      rF ||j                  |       |j.                  %|j.                  j"                  j%                          yyt        |t&        j0                        rJ|j.                  j"                  j%                          |j                  j"                  j3                  d       yy)zInitialize the weightsc                     | j                   j                  t        j                        }t        j
                  j                  |d|      }|j                  | j                        | _         y )Nr   )meanstd)datarg   r!   rz   r   inittrunc_normal_rb   )rf   r  data_float_32	data_inits       r'   trunc_normal_f32_z>VJEPA2PreTrainedModel._init_weights.<locals>.trunc_normal_f32_  sG    "KKNN5==9M--m#3-OI#,,v||4FKr&   )r  r   g      ?Nr   )r8   initializer_ranger0   r  r  r,  r  r  rt  rf   r  r  rO  r  r<  rB  rF  r  zero_r   r   Conv2drB   r   r  fill_)r1   rk   init_stdr  r'  r#  r  s          r'   _init_weightsz#VJEPA2PreTrainedModel._init_weights  s    ;;00	5
 f34f11x@%f&B&BAF A5!S&)!%//":":"A"AsK!%))--"6"6C@A c&">">?!CKKCf::>>BBIIsS 9:++""''--/!&"4"4(CBIIryy ABfmm:{{&  &&( '-KK""$MM$$S) .r&   N)r   r   r   r   r#   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attnr  r%   r&   r'   r  r    s3     +O&*# N*r&   r  c                     | 7| j                  d      j                  d      } | j                  |dddd      } | S dg|z  } | S )z
    Inputs:
        - head_mask: bsz x seq_length x seq_length | None
    Returns
        - [num_hidden_layers x batch x num_heads x seq_length x seq_length] | [num_hidden_layers]
    Nr   r   r,   )r   r\  )r   r!  s     r'   _convert_head_mask_to_5dr    s[     ''*44Q7	$$%6BBG	  F..	r&   c                   4    e Zd Zdef fdZdefdZee	 	 	 	 	 	 	 dde	j                  dee	j                     deee	j                        dee	j                     d	eee	j                        d
edee   dee   defd              Zde	j                  fdZ xZS )VJEPA2Modelr8   c                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y rG   )r.   r?   r8   r  encoderrV  	predictor	post_initrG  s     r'   r?   zVJEPA2Model.__init__  s;     $V,(0 	r&   rN   c                 B    | j                   j                  j                  S rG   )r  rj   r_   r   s    r'   get_input_embeddingsz VJEPA2Model.get_input_embeddings"  s    ||&&777r&   rM   context_head_maskrJ  target_head_maskrK  skip_predictorr   r)  c	                    ||n| j                   j                  }||n| j                   j                  }|t        d      t	        || j                   j
                        }t	        || j                   j                        }| j                  ||||      }
|
j                  }|||j                  d      }|j                  d      }t        j                  ||j                        j                  d      j                  |df      g}t        j                  ||j                        j                  d      j                  |df      g}|sO| j                  ||||||      }t!        |j                  t#        ||      |j$                  |j&                        }nd}t)        |t#        ||      |
j$                  |
j&                  |	      }|S )
aL  
        context_head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
            The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard) for the context.
        context_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*):
            The mask position ids indicating which encoder output patches are going to be exposed to the predictor.
            By default, this mask is created as torch.arange(N).unsqueeze(0).repeat(B,1), indicating full context
            available to the predictor.
        target_head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
            The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard) for the target.
        target_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*):
            The mask position ids indicating which encoder output patches are going to be used as a prediction target
            for the predictor. By default, this mask is created as torch.arange(N).unsqueeze(0).repeat(B,1), indicating
            that the predictor should predict all encoder patches.
        skip_predictor (bool):
            flag to skip the predictor forward, useful if you just need the encoder outputs
        Nz'You have to specify pixel_values_videos)rM   r   r   r)  r   r   r   )rf  rJ  rK  r   r   r)  )r   r   r   r   )r   r   r   r   r*   )r8   r   r)  r   r  r!  rX  r  r   r   r!   r   r   r   re   r  r   r:  r   r   r)   )r1   rM   r  rJ  r  rK  r  r   r)  r|   encoder_outputssequence_outputr   r   predictor_outputsr*   encoder_outputs                    r'   rT   zVJEPA2Model.forward%  s   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &FGG 55FHeHef34DdkkFhFhi+/<< 3'/!5	 ,8 ,
 *;;K$7#((+A$$Q'A!LL3F3M3MNXXYZ[bbdeghcijkL <<2E2L2LMWWXYZaacdfgbhijK15&5)'*"3%9 2@ 2  D"3"E"E$/$M/==,77	   $9- +O\ J)77&11-
 r&   c                 <    | j                  |      }|j                  S rG   )rT   r   )r1   rM   r  s      r'   get_vision_featureszVJEPA2Model.get_vision_featuresy  s    &9:///r&   )NNNNFNN)r   r   r   r   r?   r7   r  r   r   r!   rY   r   r-   r   r)   rT   r  r4   r5   s   @r'   r  r    s    | 8&= 8  59593748$,0/3P"\\P $ELL1P tELL12	P
 #5<<0P d5<<01P P $D>P 'tnP 
*P  Pd0%,, 0r&   r  z}
    V-JEPA 2 Model transformer with a video classification head on top (a linear layer on top of the attentive pooler).
    c                        e Zd Zdef fdZee	 	 	 d	dej                  de	ej                     de	e
   de	e
   deeef   f
d              Z xZS )
VJEPA2ForVideoClassificationr8   c                    t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  |j                  |j                  d      | _
        | j                          y r   )r.   r?   
num_labelsr  r  r  poolerr   r   r9   
classifierr  rG  s     r'   r?   z%VJEPA2ForVideoClassification.__init__  sd      ++!&) ,F3))F$6$68I8IPTU 	r&   rM   labelsr   r)  rN   c                    | j                  |d||      }|j                  }| j                  |      }| j                  |      }d}	|| j	                  ||| j
                        }	t        |	||j                  |j                        S )ag  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> import numpy as np
        >>> from transformers import AutoVideoProcessor, VJEPA2ForVideoClassification

        >>> device = "cuda"

        >>> video_processor = AutoVideoProcessor.from_pretrained("facebook/vjepa2-vitl-fpc16-256-ssv2")
        >>> model = VJEPA2ForVideoClassification.from_pretrained("facebook/vjepa2-vitl-fpc16-256-ssv2").to(device)

        >>> video = np.ones((64, 256, 256, 3))  # 64 frames, 256x256 RGB
        >>> inputs = video_processor(video, return_tensors="pt").to(device)

        >>> # For inference
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> logits = outputs.logits

        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])

        >>> # For training
        >>> labels = torch.ones(1, dtype=torch.long, device=device)
        >>> loss = model(**inputs, labels=labels).loss

        ```T)rM   r  r   r)  N)pooled_logitsr  r8   )losslogitsr   r   )	r  r   r  r  loss_functionr8   r   r   r   )
r1   rM   r  r   r)  r   r   pooler_outputr  r  s
             r'   rT   z$VJEPA2ForVideoClassification.forward  s    X ++ 3/!5	  
 $55$56/%%F6RVR]R]%^D$!//))	
 	
r&   )NNN)r   r   r   r   r?   r   r   r!   rY   r   r   r   r$   r   rT   r4   r5   s   @r'   r  r  ~  s    |   *.,0/3>
"\\>
 &>
 $D>	>

 'tn>
 
u++	,>
  >
r&   r  )r  r  r  )r   )r   F):dataclassesr   typingr   r   r   r!   r   activationsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   utilsr   r   r   r   configuration_vjepa2r   
get_loggerr   r   r   r)   Moduler7   r[   rY   r   r   r   r   r   r   r   r   r  r  r-   r:  r<  rV  rj  r}  r  r  r  r  r  r  r  __all__r%   r&   r'   <module>r     s   " , ,   ! 9 F F K K . 
		H	% 
<; < < {  * bii  Fryy T %II%<<% 
% <<	%
 U\\*% % %<)6O")) OfU\\ e T V[VbVb *%RYY %		  4, 4n;
BII ;
|0 0T%,,-? 0ELL 0"C!		 C!Lv
bii v
rD)		 D)NG) G)V+%? +\%&@ %P'BII '& -*O -* -*` d0' d0 d0N 
N
#8 N

N
b Sr&   