
    rhu                        d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z	 d dl
m	c mZ d dlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5  e0jl                  e7      Z8 G d de	jr                        Z: G d de	jr                        Z; G d dej                  jr                        Z< G d de	jr                        Z= G d de	j|                        Z? ed       G d  de	jr                               Z@ G d! d"e	jr                        ZAd#ej                  d$ej                  d%ej                  d&eCej                  ej                  f   fd'ZDd(ej                  d)eEd&ej                  fd*ZF	 d]d+e	jr                  d,ej                  d-ej                  d.ej                  d/eej                     d0eGd1eGfd2ZH	 d]d+e	jr                  d,ej                  d-ej                  d.ej                  d/eej                     d0eGd1eGfd3ZI G d4 d5e	jr                        ZJ G d6 d7e      ZKe. G d8 d9e)             ZLe. G d: d;eL             ZM G d< d=eLe      ZNe e.d>?       G d@ dAe#                    ZO G dB dCej                  jr                        ZP G dD dEe	jr                        ZQdF ZR G dG dHe	jr                        ZSdIej                  d,ej                  fdJZTd,ej                  d-ej                  dIej                  d&eCej                  ej                  f   fdKZU G dL dMe	jr                        ZV G dN dOe	jr                        ZW G dP dQe      ZX G dR dSe	jr                        ZY G dT dUe	jr                        ZZ G dV dWe	jr                        Z[ G dX dYeL      Z\ G dZ d[eLe      Z]g d\Z^y)^    N)	dataclass)CallableOptionalUnion)Llama4VisionConfig   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)check_model_inputs   )Llama4ConfigLlama4TextConfigc                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Llama4TextExpertsconfigc                    t         |           |j                  | _        |j                  | _        |j
                  | _        | j                  | _        t        j                  t        j                  | j                  | j
                  d| j                  z              | _        t        j                  t        j                  | j                  | j                  | j
                  f            | _        t        |j                     | _        y N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchemptygate_up_proj	down_projr	   
hidden_actact_fnselfr%   	__class__s     }/var/www/html/ai-insurance-compliance-backend/venv/lib/python3.12/site-packages/transformers/models/llama4/modeling_llama4.pyr*   zLlama4TextExperts.__init__.   s    !33!'!9!9!--00LLT5E5EtGWGWYZ]a]l]lYl)mnekk43C3CT__VZVfVf2g&hiV../    hidden_statesreturnc                 v   |j                  | j                  j                  d   d| j                        }t	        j
                  || j                        }|j                  dd      \  }}t	        j
                  || j                  |      z  | j                        }|j                  d| j                        }|S )a2  
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        r   r(   dim)	viewr4   shaper.   r2   bmmchunkr7   r5   )r9   r=   gate_upgateupnext_statess         r;   forwardzLlama4TextExperts.forward8   s     &**4+<+<+B+B1+Er4K[K[\))M4+<+<====+biidkk$&7!7$..I!&&r4+;+;<r<   )	__name__
__module____qualname__r"   r*   r2   TensorrK   __classcell__r:   s   @r;   r$   r$   -   s+    0/ 0U\\ ell r<   r$   c                   &     e Zd Zd fd	Zd Z xZS )Llama4TextMLPc                 f   t         |           ||j                  }|| _        t	        j
                  |j                  |d      | _        t	        j
                  |j                  |d      | _        t	        j
                  ||j                  d      | _	        t        |j                     | _        y NFbias)r)   r*   r-   r%   r0   Linearr.   	gate_projup_projr5   r	   r6   activation_fn)r9   r%   r-   r:   s      r;   r*   zLlama4TextMLP.__init__O   s    $ & 8 86#5#57HuUyy!3!35FUS#4f6H6HuU#F$5$56r<   c                     | j                  | j                  |            | j                  |      z  }| j                  |      S N)r[   rY   rZ   r5   )r9   xr5   s      r;   rK   zLlama4TextMLP.forward[   s7    &&t~~a'89DLLOK	~~i((r<   r]   rL   rM   rN   r*   rK   rP   rQ   s   @r;   rS   rS   N   s    
7)r<   rS   c                   8     e Zd Zddef fdZd Zd Zd Z xZS )Llama4TextL2Normepsc                 0    t         |           || _        y r]   )r)   r*   rb   )r9   rb   r:   s     r;   r*   zLlama4TextL2Norm.__init__a   s    r<   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S Nr(   r@   T)keepdimr2   rsqrtpowmeanrb   r9   r^   s     r;   _normzLlama4TextL2Norm._norme   4    5;;quuQx}}R}>IJJJr<   c                 ^    | j                  |j                               j                  |      S r]   )rl   floattype_asrk   s     r;   rK   zLlama4TextL2Norm.forwardh   s"    zz!'')$,,Q//r<   c                      d| j                    S )Nzeps=rb   r9   s    r;   
extra_reprzLlama4TextL2Norm.extra_reprk   s    dhhZ  r<   )gư>)	rL   rM   rN   ro   r*   rl   rK   rt   rP   rQ   s   @r;   ra   ra   `   s    E K0!r<   ra   c                   2     e Zd Zd fd	Zd Zd Zd Z xZS )Llama4TextRMSNormc                     t         |           || _        t        j                  t        j                  |            | _        y)z<
        Llama4RMSNorm is equivalent to T5LayerNorm
        N)r)   r*   rb   r0   r1   r2   onesweight)r9   r.   rb   r:   s      r;   r*   zLlama4TextRMSNorm.__init__p   s0     	ll5::k#:;r<   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S re   rg   rk   s     r;   rl   zLlama4TextRMSNorm._normx   rm   r<   c                 |    | j                  |j                               j                  |      }|| j                  z  S r]   )rl   ro   rp   ry   )r9   r^   outputs      r;   rK   zLlama4TextRMSNorm.forward{   s0    AGGI&..q1##r<   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tuplery   rD   rb   rs   s    r;   rt   zLlama4TextRMSNorm.extra_repr   s'    ))*+6$((<<r<   )gh㈵>)rL   rM   rN   r*   rl   rK   rt   rP   rQ   s   @r;   rv   rv   o   s    <K$=r<   rv   c                   (     e Zd Z fdZ fdZ xZS )Llama4Routerc                     t         |   |j                  |j                  d       |j                  | _        |j
                  | _        y rU   )r)   r*   r.   r+   r,   num_experts_per_toktop_kr8   s     r;   r*   zLlama4Router.__init__   s>    ++V-E-EER!33//
r<   c                 t   t         |   |      }t        j                  || j                  d      \  }}t        j
                  |t        d            j                  d||      }t        j                  j                  j                  |j                               j                  |j                        }||fS )Nr    rA   z-inf)r)   rK   r2   topkr   	full_likero   scatter_r0   
functionalsigmoidtodtype)r9   r=   router_logitsrouter_top_valuerouter_indicesrouter_scoresr:   s         r;   rK   zLlama4Router.forward   s    6+0::mTZZUV+W(.uV}ENNqR`brs++33M4G4G4IJMMmNaNabm++r<   r_   rQ   s   @r;   r   r      s    0
, ,r<   r   Llama4TextMoec                   $     e Zd Z fdZd Z xZS )r   c                     t         |           |j                  | _        |j                  | _        |j                  | _        t        |      | _	        t        |      | _        t        |      | _        y r]   )r)   r*   r   r   r.   
hidden_dimr+   r,   r$   expertsr   routerrS   shared_expertr8   s     r;   r*   zLlama4TextMoe.__init__   s[    //
 ,,!33(0"6**62r<   c                    |j                  d| j                        }| j                  |      \  }}|j                  |j                  d   d      }||j                  dd      z  }| j                  |      }| j                  |      }|j                  |j                  |j                  d   d|j                  d         j                  d             ||fS )Nr@   r    r   rA   )	reshaper   r   repeatrD   r   r   add_sum)r9   r=   r   r   	routed_in
routed_outouts          r;   rK   zLlama4TextMoe.forward   s    %--b$//B'+{{='A$}!(()<)<Q)?C	 5 5b! <<	\\),
  /##M$7$7$:B
@P@PQS@TUYY^_Y`aM!!r<   r_   rQ   s   @r;   r   r      s    3"r<   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )Llama4TextRotaryEmbeddingr%   c                 `   t         |           |j                  dnd| _        |j                  | _        |j                  | _        || _        t        | j                     | _	        | j                  | j                  |      \  }| _
        | j                  d|d       | j                  | _        y )Nllama3defaultinv_freqF)
persistent)r)   r*   rope_scaling	rope_typemax_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr%   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r9   r%   devicer   r:   s       r;   r*   z"Llama4TextRotaryEmbedding.__init__   s    %+%8%8%D)"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r<   c                    | j                   d d d d f   j                         j                  |j                  d   dd      }|d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                  |j
                        |z  j                  dd      }t        j                  t        j                  |      |      }|| j                  z  }d d d        |S # 1 sw Y   S xY w)	Nr   r@   r    mpscpuF)device_typeenabledr(   )r   ro   expandrD   
isinstancer   typestrr2   autocastr   	transposepolar	ones_liker   )r9   r^   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_ciss           r;   rK   z!Llama4TextRotaryEmbedding.forward   s
    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	;&))!((36KKVVWXZ[\EEOOE$:EBI!D$:$::I	;
 	;
 s   A'D88Er]   )
rL   rM   rN   r"   r*   r2   no_gradr   rK   rP   rQ   s   @r;   r   r      s4    // / U]]_
  
r<   r   xqxkr   r>   c           	      &   t        j                   | j                         j                  g | j                  d d dd       }t        j                   |j                         j                  g |j                  d d dd       }t        j
                  ||d d d d d d d f   z        j                  d      }t        j
                  ||d d d d d d d f   z        j                  d      }|j                  |       |j                  |      fS )Nr@   r(   r   )r2   view_as_complexro   r   rD   view_as_realflattenrp   )r   r   r   xq_xk_xq_outxk_outs          r;   apply_rotary_embr      s    
 

 2
 2 2 IBHHSbM I2 Iq I
JC


 2
 2 2 IBHHSbM I2 Iq I
JCi1dA&> >?GGJFi1dA&> >?GGJF>>"v~~b111r<   r=   n_repc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)rD   r   r   )r=   r   batchnum_key_value_headsslenhead_dims         r;   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr<   modulequerykeyvalueattention_maskscalingdropoutc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr(   r   r@   rA   ptrainingr    )r   num_key_value_groupsr2   matmulr   rD   r0   r   softmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r;   eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2(>L==((6??([L,,|\:K''1-88:K$$r<   c                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            | j
                  dz  z  }
|#|d d d d d d d |j                  d   f   }|
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )	Nr(   r         r   r@   rA   r   r    )r   r   r2   r   r   r   rD   r0   r   r   r   r   r   r   s                r;   vision_eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABV__VZEZZL!$Q1.D
0@0@0D.D%DE#k1==((2(>L==((6??([L,,|\:K''1-88:K$$r<   c                   2    e Zd ZdZdef fdZ	 	 ddej                  deej                  ej                  f   de	ej                     de	e
   de	ej                     d	ee   d
eej                  e	ej                     e	eej                        f   fdZ xZS )Llama4TextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr%   c                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  | _        |j                  |j                  z  | _	        |j                  | _        | j                  dz  | _
        |j                  | _        |j                  | _        |j                  | _        |j                  | _        d| _        |j                   |   | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j
                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  | j                  z  |j
                  |j(                        | _        | j                  j2                  r(| j"                  rt5        |j6                        | _        y y y )Nr   r   TrV   )r)   r*   r%   	layer_idxgetattrr.   num_attention_headsr   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper0   rX   attention_biasq_projk_projv_projo_projuse_qk_normra   rms_norm_epsqk_normr9   r%   r   r:   s      r;   r*   zLlama4TextAttention.__init__  s   "
F4F4F&JdJd4de#)#=#= $*$>$>&B\B\$\!#)#=#= }}d* ++!--'-'E'E$!'!9!9--i8ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 ;;""t}}+F,?,?@DL (5"r<   r=   position_embeddingsr   past_key_valuecache_positionr   r>   c                 j   |j                   d d }g |d| j                  }| j                  |      j                  |      }	 | j	                  |      j                  g |d| j                   }
| j                  |      j                  |      j                  dd      }| j                  r)t        |	|
|j                  |	j                              \  }	}
t        | d      r"| j                  |	      }	| j                  |
      }
| j                  r| j                  st        j                  t        j                   |j#                         dz   | j$                  z        dz         | j&                  z  dz   }|j                  d|d   ddf      j)                  g |dd      }|	|z  j                  |	j*                        }	|	j                  dd      }	|
j                  dd      }
|%d|i}|j-                  |
|| j.                  |      \  }
}t0        }| j2                  j4                  dk7  rt6        | j2                  j4                     } || |	|
||f| j8                  sdn| j:                  | j<                  d	|\  }} |j>                  g |d jA                         }| jC                  |      }||fS )
Nr@   r    r(   r        ?r  eager        )r   r   )"rD   r   r   rC   r   r   r   r   r   r   r   hasattrr  r   r2   logfloorro   r   r   r   r   updater   r   r%   _attn_implementationr   r   r   r   r   r   r  )r9   r=   r  r   r  r  r   input_shapehidden_shapequery_statesr   r   attn_scalescache_kwargsattention_interfacer   r   s                    r;   rK   zLlama4TextAttention.forward5  s    $))#2.88b8$--8{{=166|D4T[[/44UkU2Ut}}U
{{=166|DNNqRST=='7j*=*@*@ATAT*U($L* 4#<<5Lj1J ''		%++~';';'='CtGWGW&WX[^^_bfbqbqqtww  &**A{21+EFMMNbP[Nb]^Nb`aNbcK(;6::<;M;MNL#--a3))!Q/
%,n=L'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r<   NN)rL   rM   rN   __doc__r"   r*   r2   rO   r~   r   r
   
LongTensorr   r   rK   rP   rQ   s   @r;   r   r     s    GA/ AF +/599)||9) #5<<#=>9) !.	9)
 !9) !!1!129) -.9) 
u||Xell3XeELL>Q5RR	S9)r<   r   c                   t    e Zd Z fdZ	 	 	 	 	 	 ddej
                  deej
                     deej                     deeej
                        dee	   deej                     deeej
                  ej
                  f      d	e
e   d
eej                  eeej                  ej                  f      f   fdZ xZS )Llama4TextDecoderLayerc                    t         |           |j                  | _        || _        |j                  |   | _        t        ||      | _        ||j                  v | _	        | j                  rt        |      | _        nt        ||j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )N)r-   rr   )r)   r*   r.   r   layer_typesattention_typer   	self_attn
moe_layersis_moe_layerr   feed_forwardrS   intermediate_size_mlprv   r  input_layernormpost_attention_layernormr  s      r;   r*   zLlama4TextDecoderLayer.__init__r  s    !--"$00;,VY?%):):: -f 5D -fHdHd eD01C1CI\I\](9&:L:LRXReRe(f%r<   r=   r   r   r  	use_cacher  r  r   r>   c           
         |}	| j                  |      } | j                  d||||||d|\  }
}|	|
z   }|}	| j                  |      }| j                  |      }| j                  r|\  }}|	|j                  |	j                        z   }|S )N)r=   r  r   r  r'  r   )r%  r   r&  r#  r"  rC   rD   )r9   r=   r   r   r  r'  r  r  r   residualattention_states_s               r;   rK   zLlama4TextDecoderLayer.forward  s     !,,]; -dnn 
' 3)))
 
! !#33 !55mD))-8,M1 =#5#5hnn#EEr<   )NNNFNN)rL   rM   rN   r*   r2   rO   r   r  r~   boolr   r   FloatTensorrK   rP   rQ   s   @r;   r  r  q  s    g$ 26378<$)59KO"||" !." u//0	"
 !u||!45" D>" !!1!12" &eELL%,,,F&GH" -." 
u  (51B1BEDUDU1U+V"WW	X"r<   r  c                   <    e Zd ZU eed<   dZdgZdZdZdZ	dZ
dZd Zy)Llama4PreTrainedModelr%   Tpast_key_valuesFc                 V   t        | j                  d      r| j                  j                  n| j                  j                  j                  }t	        |t
        j                        rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t	        |t
        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y t	        |t
        j                        rJ|j                  j                  j                  d       |j                  j                  j                          y t	        |t               r&|j                  j                  j                  d       y t	        |t"              rO|j$                  j                  j                  d|       |j&                  j                  j                  d|       y t	        |t(              ra|j*                  j                  j                  |j,                         |j.                  j                  j                  |j,                         y y )Ninitializer_ranger  )rj   stdr
  )r4  )r  r%   r3  text_configr   r0   rX   ry   datanormal_rW   zero_	Embeddingpadding_idx	LayerNormfill_rv   r$   r4   r5   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r9   r   r4  s      r;   _init_weightsz#Llama4PreTrainedModel._init_weights  s    t{{$78 KK))((:: 	
 fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> .-MM$$S)KK""$ 12MM$$S) 12$$,,#3,?!!))s)< 12""''//FLL/A++0088V\\8J 3r<   N)rL   rM   rN   r!   __annotations__supports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendrA  r)  r<   r;   r0  r0    s:    &*##4"5 N!"&Kr<   r0  c                   .    e Zd ZU dgZdZeed<   eee	dZ
def fdZee	 	 	 	 	 	 	 ddej                  deej"                     deej                     d	ee   d
eej&                     dee   deej                     dee   deeef   fd              Z xZS )Llama4TextModelr  modelr%   )
attentionsr=   r   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nrr   )r%   F)r)   r*   pad_token_idr:  
vocab_sizer0   r9  r.   embed_tokens
ModuleListrangenum_hidden_layersr  layersrv   r  normr   
rotary_embgradient_checkpointing	post_initr  s      r;   r*   zLlama4TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammHMfNfNfHgh9#FI6h
 &f&8&8f>Q>QR	36B&+# 	 is   D	input_idsr   r   r1  inputs_embedsr'  r  r   r>   c                    |d u |d uz  rt        d      |>| j                  |j                  | j                  j                  j                              }|r|
t               }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              s*| j                  |||||d}t        d	i |t        d	i |d}
|}| j!                  ||      }| j"                  d | j                  j$                   D ]  } ||f|
|j&                     |||||d|}! | j)                  |      }t+        ||r|      S d       S )
N:You must specify exactly one of input_ids or inputs_embedsr   r    )r   )r%   input_embedsr   r  r1  r   )full_attentionchunked_attention)r   r   r  r'  r  r  )last_hidden_stater1  r)  )
ValueErrorrQ  r   ry   r   r   get_seq_lengthr2   arangerD   	unsqueezer   dictr%   r   r   rW  rU  rT  r  rV  r   )r9   rZ  r   r   r1  r[  r'  r  r   past_seen_tokenscausal_mask_mappingmask_kwargsr=   freq_cisdecoder_layers                  r;   rK   zLlama4TextModel.forward  s    -t";<YZZ  --ill4;L;L;S;S;Z;Z.[\M0*nO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L ?-F ++ -"0"0#2 ,K #5"C{"C%?%N+%N#
 & ??=,?![[)H4;;+H+HI 
	M)	2=3O3OP).#-$,	 	M
	 		-0&+/8O
 	
>B
 	
r<   )NNNNNNN)rL   rM   rN   _no_split_modulesbase_model_prefixr"   rB  r   r  r   _can_record_outputsr*   r   r   r2   r  r   rO   r
   r.  r-  r   r   r   r~   r   rK   rP   rQ   s   @r;   rK  rK    s   12)/&/    '+1537+/59$(59C
##C
 !.C
 u//0	C

 "%C
   1 12C
 D>C
 !!1!12C
 +,C
 
u--	.C
  C
r<   rK  c                       e Zd ZU dgZdZdgZddiZeed<   def fdZ	d Z
d	 Zee	 	 	 	 	 	 	 	 	 dd
ej                  deej"                     deej                     deeeeej*                     f      deej*                     deej                     dee   deej                     deeej"                  f   dee   deeef   fd              Z xZS )Llama4ForCausalLMr  language_modelzlm_head.weightlm_headcolwise_repr%   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y rU   )
r)   r*   rK  rL  rP  r0   rX   r.   rr  rY  r8   s     r;   r*   zLlama4ForCausalLM.__init__7  sU     $V,
 ++yy!3!3V5F5FUS 	r<   c                     || _         y r]   rL  r9   decoders     r;   set_decoderzLlama4ForCausalLM.set_decoder@  s	    
r<   c                     | j                   S r]   rv  rs   s    r;   get_decoderzLlama4ForCausalLM.get_decoderC  s    zzr<   rZ  r   r   r1  r[  labelsr'  r  logits_to_keepr   r>   c
                 l    | j                   d|||||||d|
}|d   }t        |	t              rt        |	 d      n|	}| j	                  |dd|ddf         }d}|* | j
                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )az  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rZ  r   r   r1  r[  r'  r  r   N)logitsr|  rP  )lossr  r1  r=   rM  r)  )rL  r   intslicerr  loss_functionr%   rP  r   r1  r=   rM  )r9   rZ  r   r   r1  r[  r|  r'  r  r}  r   outputsr=   slice_indicesr  r  s                   r;   rK   zLlama4ForCausalLM.forwardF  s    J $** 	
)%+')	
 	
  
8B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r<   )	NNNNNNNNr   )rL   rM   rN   rl  rm  _tied_weights_keys_tp_planr"   rB  r*   ry  r{  r   r   r2   r  r   rO   r   r
   listr.  r-  r  r   r   r~   r   rK   rP   rQ   s   @r;   rp  rp  0  s_   12(*+=)H/   '+1537KO59-1$(5934<
##<
 !.<
 u//0	<

 "%tE4E4E/F(F"GH<
   1 12<
 ))*<
 D><
 !!1!12<
 c5<</0<
 +,<
 
u,,	-<
  <
r<   rp  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    )custom_introc                      e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	Llama4CausalLMOutputWithPasta\  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr  r  r1  r=   rM  image_hidden_states)rL   rM   rN   r  r  r   r2   r.  rB  r  r1  r  r=   r~   rM  r  r)  r<   r;   r  r    s      )-D(5$$
%, $FE$9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r<   r  c                   $     e Zd Z fdZd Z xZS )Llama4VisionMLP2c                 ~   t         |           |j                  | _        |j                  | _        t	        j
                  | j                  |j                  d      | _        t	        j
                  |j                  |j                  d      | _	        t	        j                         | _        |j                  | _        y rU   )r)   r*   r.   r-   r0   rX   projector_input_dimfc1projector_output_dimfc2GELUr[   projector_dropoutr   r8   s     r;   r*   zLlama4VisionMLP2.__init__  s    !--!'!9!999T33V5O5OV[\99V88&:U:U\abWWY//r<   c                     | j                  |      }| j                  |      }t        j                  || j                  | j                        }| j                  | j                  |            S )Nr   )r  r[   Fr   r   r  r9   r=   s     r;   rK   zLlama4VisionMLP2.forward  sT    /**=9		-4<<$--X!!$((="9::r<   r_   rQ   s   @r;   r  r    s    0;r<   r  c                   $     e Zd Z fdZd Z xZS )Llama4MultiModalProjectorc                     t         |           t        j                  |j                  j
                  |j                  j                  d      | _        y rU   )	r)   r*   r0   rX   vision_configvision_output_dimr5  r.   linear_1r8   s     r;   r*   z"Llama4MultiModalProjector.__init__  s?    		  22**
r<   c                 (    | j                  |      }|S r]   )r  )r9   image_featuresr=   s      r;   rK   z!Llama4MultiModalProjector.forward  s    n5r<   r_   rQ   s   @r;   r  r    s    
r<   r  c           
      J   | j                   \  }}}t        t        j                  |            }| j	                  |||d      } | j                         \  }}}}| j	                  ||t        ||z        t        ||z              }|j                  dddd      j                         }|j	                  |t        ||z        t        ||z        t        ||dz  z              }|j                  dddd      j                         }|j	                  |d|j                   d         }	|	S )Nr@   r   r(   r    r   )rD   r  mathsqrtrC   sizepermuter   )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensors
             r;   pixel_shuffler    s%   (4(:(:%JXTYY{+,J$$ZZLL*6*;*;*='Jx"''
FC@U<VX[\dgt\tXuvO%--aAq9DDFO%**C./U]5J1KSQY]jlm]mQnMoO &--aAq9DDFO#((R9N9Nr9RSMr<   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4VisionPixelShuffleMLPc                     t         |           |j                  | _        t        |j                  | j                  dz  z        | _        |j                  | _        t        |      | _	        y r'   )
r)   r*   pixel_shuffle_ratior  r  	inner_dimr  
output_dimr  mlpr8   s     r;   r*   z$Llama4VisionPixelShuffleMLP.__init__  sX    #)#=#= V77D<T<TVW<WXY 55#F+r<   encoded_patchesr>   c                 P    t        || j                        }| j                  |      S r]   )r  r  r  )r9   r  s     r;   rK   z#Llama4VisionPixelShuffleMLP.forward  s#    '9Q9QRxx((r<   rL   rM   rN   r*   r2   rO   rK   rP   rQ   s   @r;   r  r    s#    ,)u|| ) )r<   r  freqs_cic                     |j                   }t        |j                        D cg c]  \  }}|dk(  s||dz
  k(  r|nd }}} | j                  | S c c}}w )Nr    )ndim	enumeraterD   rC   )r  r   r  idrD   s         r;   reshape_for_broadcastr    sW    ::D=Fu{{=STTQ!q&AMQq0TET8==%   Us   Ac                 B   t        j                   | j                         j                  g | j                  d d dd       }t        j                   |j                         j                  g |j                  d d dd       }t        ||      }|j                  |j                        }t        j                  ||z        j                  d      }t        j                  ||z        j                  d      }|j                  |       |j                  |      fS )Nr@   r(   )r  r   r   )r2   r   ro   r   rD   r  r   r   r   r   rp   )r   r   r  query_key_	query_outkey_outs          r;   vision_apply_rotary_embr    s    
 ""#85;;=#8#8#R%++cr:J#RB#RPQ#RSF  !4!4!4!Lciin!Lb!L!!LMD$hfEH{{6==)H""6H#45==a@I  199!<GU#W__S%999r<   c                        e Zd Zdef fdZ	 	 d
dej                  dej                  deej                     dee   de	e
   deej                  eej                     eeej                        f   fd	Z xZS )Llama4VisionAttentionr%   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  |j
                  z  | _        d| _        |j                  | _	        | j                  dz  | _
        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j                  d      | _        y )Nr    r   TrV   )r)   r*   r%   r.   	embed_dimr   	num_headsr   r   r   r   r0   rX   r   r   r   r  r8   s     r;   r*   zLlama4VisionAttention.__init__  s   ++33**f.H.HH$%!!'!9!9}}d*ii0NUYZii0NUYZii0NUYZii >UYZr<   r=   r  r   r  r   r>   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      }| j	                  |      j                  |      }	| j                  |      j                  |      }
t        ||	|      \  }}	|j                  dd      }|	j                  dd      }	|
j                  dd      }
t        }| j                  j                  dvrt        | j                  j                     } || ||	|
d f| j                  sdn| j                  d dd|\  }} |j                  g |d j                         }| j!                  |      }||fS )	Nr@   )r  r    r(   )r  flex_attentionr  F)r   r   r   )rD   r   r   rC   r   r   r  r   r   r%   r  r   r   r   r   r   r  )r9   r=   r  r   r  r   r  r  r  r   r   r  r   r   s                 r;   rK   zLlama4VisionAttention.forward  su    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|D#:<^f#g j#--a3))!Q/
#--a3(F;;++3NN"9$++:Z:Z"[$7
%
  $}}C$2H2H
%
 
%
!\ *k));;;;FFHkk+.L((r<   r  )rL   rM   rN   r   r*   r2   rO   r   r
   r   r   r~   rK   rP   rQ   s   @r;   r  r    s    [1 [& 26*.()||() ,,() !.	()
 !() -.() 
u||Xell3XeELL>Q5RR	S()r<   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4VisionMLPc                 &   t         |           || _        t        j                         | _        t        j                  |j                  |j                  d      | _	        t        j                  |j                  |j                  d      | _
        y )NTrV   )r)   r*   r%   r0   r  r[   rX   r.   r-   r  r  r8   s     r;   r*   zLlama4VisionMLP.__init__7  se    WWY99V//1I1IPTU99V55v7I7IPTUr<   r=   r>   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r]   )r  r[   r  r  s     r;   rK   zLlama4VisionMLP.forward>  s4    /**=9/r<   r  rQ   s   @r;   r  r  6  s$    VU\\ ell r<   r  c            
            e Zd Zdef fdZ	 	 ddej                  dej                  deej                     dee   fdZ	 xZ
S )	Llama4VisionEncoderLayerr%   c                    t         |           |j                  | _        t        |      | _        t        |      | _        t        j                  |j                        | _	        t        j                  |j                        | _
        y r]   )r)   r*   r.   r  r   r  r  r0   r;  r%  r&  r8   s     r;   r*   z!Llama4VisionEncoderLayer.__init__F  sb    !--.v6"6*!||F,>,>?(*V5G5G(H%r<   hidden_stater  r   output_attentionsc                     |}| j                  |      }| j                  |||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )N)r  r   )r%  r   r&  r  )r9   r  r  r   r  r*  r   r  s           r;   rK   z Llama4VisionEncoderLayer.forwardP  s      ++L9%)^^) &4 &
"l
  ,.  44\Bxx-,./&Gr<   r  )rL   rM   rN   r   r*   r2   rO   r   r-  rK   rP   rQ   s   @r;   r  r  E  sZ    I1 I 26,0ll ,, !.	
 $D>r<   r  c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  dej                  deej                     dee	   dee	   d	ee	   d
e
eef   fdZ xZS )Llama4VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    r%   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        || _        y c c}w )NF)
r)   r*   r%   r0   rR  rS  rT  r  rU  rX  )r9   r%   r,  r:   s      r;   r*   zLlama4VisionEncoder.__init__z  sW    mmuU[UmUmOn$o!%=f%E$op&+# %ps   A*r=   r  r   r  output_hidden_statesreturn_dictr>   c                 z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}| j                  D ]&  }	|r||fz   } |	||||      }
|r	||
d   fz   }|
d   }( |r||fz   }|st        d |||fD              S t        |||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr)  )r  r   r  r  r    r   c              3   &   K   | ]	  }||  y wr]   r)  .0vs     r;   	<genexpr>z.Llama4VisionEncoder.forward.<locals>.<genexpr>  s     eqWXWde   ra  r=   rM  )r%   r  r  use_return_dictrU  r~   r   )r9   r=   r  r   r  r  r  encoder_statesall_attentionsencoder_layerlayer_outputss              r;   rK   zLlama4VisionEncoder.forward  s    > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d![[ 	-M#!/=2B!B)*-"3!	M !!/=3C2E!E)!,M	-   +}.>>Ne]NN$Seee+>Vd
 	
r<   NNNN)rL   rM   rN   r  r   r*   r2   rO   r   r-  r   r~   r   rK   rP   rQ   s   @r;   r  r  q  s    1  26,0/3&*?
||?
 ,,?
 !.	?

 $D>?
 'tn?
 d^?
 
uo%	&?
r<   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Llama4UnfoldConvolutionc                 <   t         |           |j                  }t        |t              r||f}t
        j                  j                  ||j                        | _        t        j                  |j                  |d   z  |d   z  |j                  d      | _        y )N)kernel_sizestrider   r    FrV   )r)   r*   r  r   r  r2   r0   UnfoldunfoldrX   num_channelsr.   linear)r9   r%   r  r:   s      r;   r*   z Llama4UnfoldConvolution.__init__  s    ''k3'&4Khhoo+fFWFWoXii+a.0;q>A
r<   r=   r>   c                 p    | j                  |      }|j                  ddd      }| j                  |      }|S )Nr   r(   r    )r  r  r  r  s     r;   rK   zLlama4UnfoldConvolution.forward  s8    M2%--aA6M2r<   r  rQ   s   @r;   r  r    s#    

U\\ ell r<   r  c                   $     e Zd Z fdZd Z xZS )Llama4VisionRotaryEmbeddingc                    t         |           |j                  |j                  z  }t	        j
                  |dz  t        j                        j                  |dz  d      }t	        j                  ||d d gd      }d|d<   ||z  }||z  }|j                  |j                  z  dz  }d|j                  t	        j
                  d|d      d |dz   j                         |z  z  z  }|dz   d	   |d d d d f   z  j                  dd
      }|dz   d	   |d d d d f   z  j                  dd
      }	t	        j                  ||	gd
      j                         j                         dd d df   }
|
j                  |j                  d
dd      dk  d      }
t	        j                   t	        j"                  t	        j$                  |
      t	        j&                  |
      gd
            }|| _        y )Nr(   )r   r    r   rA   r   )r@   r@   r
  ).Nr@   .)r)   r*   
image_sizer  r2   rd  int32r   catr.   r   
rope_thetaro   repeat_interleaver   masked_fillr   stackcossinr  )r9   r%   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   rj  r:   s               r;   r*   z$Llama4VisionRotaryEmbedding.__init__  s   6#4#44,,sAvU[[9AA#q&!L))Wgbqk2:#3%%)C)CCqH6,,a11MN_QY]^Q^1`1f1f1hks1stu	!A%y1IdD!m4LL__`agi_j!A%y1IdD!m4LL__`agi_j		7G,"5;;=HHJ3PSRSPS8T!!'//"a";a"?C((eii6F		RWHX5Y_a)bc r<   c                 L    | j                   j                  |j                        S r]   )r  r   r   r  s     r;   rK   z#Llama4VisionRotaryEmbedding.forward  s    }} 4 455r<   r_   rQ   s   @r;   r  r    s    !"6r<   r  c                        e Zd ZU dZdgZeed<   def fdZd Z	 	 	 	 dde	j                  dee	j                     dee   d	ee   d
ee   deeee	j                  df   f   fdZ xZS )r=  vision_modelr  r%   c                 r   t         |   |       |j                  | _        |j                  | _        |j                  | _        |j
                  | _        | j                  | j                  z  dz  dz   | _        |j                  dz  | _        t        |      | _	        t        j                  | j                  t        j                  | j                        z        | _        t        j                  | j                  t        j                  | j                  | j                        z        | _        t!        |      | _        t        j$                  | j                        | _        t        j$                  | j                        | _        t+        |      | _        t/        |      | _        | j3                          y )Nr(   r    r   )r)   r*   r  r  r.   r  r  r?  r  patch_embeddingr0   r1   r2   randnr>  r@  r  rotary_embeddingr;  layernorm_prelayernorm_postr  rL  r  vision_adapterrY  r8   s     r;   r*   zLlama4VisionModel.__init__  sA     ++ ++!--"// OOt>1DqH''-
6v>!||DJJTEUEU9V,VW(*TZZ%++dN^N^`d`p`pBq5q(r% ;F C  \\$*:*:; ll4+;+;< )0
9&Ar<   c                     | j                   S )zg
        This function is used to fetch the first embedding layer to activate grads on inputs.
        )r  rs   s    r;   get_input_embeddingsz&Llama4VisionModel.get_input_embeddings  s     ###r<   pixel_valuesr   r  r  r  r>   .c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j                  \  }}}}	d}
d}| j                  |      }|j                  \  }}}|j                  ||
z  |z  ||      }| j                  j                  |j                  d   d|j                  d         }t        j                  ||gd      }|dz  }|j                  ||
z  |||      }| j                  j                  |j                  |j                        }||z   }| j                  |      }|j!                  |d|      }| j#                  |      }| j%                  |d|||      }|j&                  }| j)                  |      }|ddddddf   }| j+                  |      }|r|j,                  nd}|r|d   }nd}|st/        d	 |||fD              S t1        |||
      S )a  

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        Nr    r   r@   rA   r   r   )r   r  r  r  r(   c              3   &   K   | ]	  }||  y wr]   r)  r  s     r;   r  z,Llama4VisionModel.forward.<locals>.<genexpr>j  s     _qQRQ^_r  r  )r%   r  r  r  rD   r  r   r>  r   r2   r  r@  r   r   r   r  rC   r  rL  ra  r  r  r=   r~   r   )r9   r  r   r  r  r  batch_size_times_num_tilesr  r  r  num_concurrent_media
num_chunksr  r,  r  r   r>  positional_embeddingr  r|   r=   rM  s                         r;   rK   zLlama4VisionModel.forward  sS   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] COBTBT?"L&% 
++L9%1%7%7";
 $++&)==
JKYc
 ..55l6H6H6KQP\PbPbcePfgyy,!@aHq $++&)==z;Xb
  $<<??lFXFXamatat?u#&::)),7#(()CRT((6!5/  
 //**<8#AssAI. **<80D,,$JJ_\=*$M___*'!
 	
r<   r  )rL   rM   rN   rm  rl  r   rB  r*   r  r2   rO   r   r-  r   r   r~   rK   rP   rQ   s   @r;   r=  r=    s    &341 2$ 26,0/3&*_
ll_
 !._
 $D>	_

 'tn_
 d^_
 
ellC&7 88	9_
r<   r=  c            (           e Zd ZU ddgZi ZdZeed<   def fdZd Z	d Z
d Zd	 Zd
 Zd Zdej                   deeee   f   defdZdej,                  dej                   dej                   fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#dej,                  dej                   deej4                     deej,                     dee   deej                      deeeee   f      dee   deej,                     dee   dee   dee   dee   deej,                     deeej4                  f   dej4                  dee   d eee f   f$d!       Z!	 	 	 	 	 	 d$d"Z" xZ#S )%Llama4ForConditionalGenerationr  r   r%   c                 h   t         |   |       t        |j                        | _        t        |      | _        t        |j                        | _	        |j                  j                  | _
        | j                  j                  | j                  j                  nd| _        | j                          y )Nr@   )r)   r*   r=  r  r  r  multi_modal_projectorrp  r5  rq  rP  r%   rO  rY  r8   s     r;   r*   z'Llama4ForConditionalGeneration.__init__y  s     -f.B.BC%>v%F"/0B0BC ,,778<8P8P8\DKK44bdr<   c                 6    | j                   j                         S r]   )rq  r  rs   s    r;   r  z3Llama4ForConditionalGeneration.get_input_embeddings  s    ""7799r<   c                 :    | j                   j                  |       y r]   )rq  set_input_embeddings)r9   r   s     r;   r%  z3Llama4ForConditionalGeneration.set_input_embeddings  s    007r<   c                 6    | j                   j                         S r]   )rq  get_output_embeddingsrs   s    r;   r'  z4Llama4ForConditionalGeneration.get_output_embeddings  s    ""88::r<   c                 :    | j                   j                  |       y r]   )rq  set_output_embeddings)r9   new_embeddingss     r;   r)  z4Llama4ForConditionalGeneration.set_output_embeddings  s    11.Ar<   c                 :    | j                   j                  |       y r]   )rq  ry  rw  s     r;   ry  z*Llama4ForConditionalGeneration.set_decoder  s    ''0r<   c                 6    | j                   j                         S r]   )rq  r{  rs   s    r;   r{  z*Llama4ForConditionalGeneration.get_decoder  s    ""..00r<   r  vision_feature_layervision_feature_select_strategyc                     |dvrt        d| j                         |j                         D ci c]  \  }}|	|| }}} | j                  |fddi|}|j                  }|S c c}}w )a  
        Obtains image last hidden states from the vision tower and apply al projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, list[int]]`):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        )r   fullz$Unexpected select feature strategy: r  F)rb  r.  itemsr  ra  )	r9   r  r-  r.  r   kr  image_outputsr  s	            r;   get_image_featuresz1Llama4ForConditionalGeneration.get_image_features  s    . *1DDCDDgDgChijj#)<<>C41aQ]!Q$CC))),]U]V\]$66 Ds
   
A&A&rZ  r[  r  c                 ,   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }||   j                         |j                         k7  rt        d| d|j                  d          |S )z
        Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r  r@   z6Image features and image tokens do not match: tokens: z, features r   )r  r2   tensorr%   image_token_idlongr   allr   re  	expand_asr   numelrb  rD   )r9   rZ  r[  r  special_image_maskn_image_tokenss         r;   get_placeholder_maskz3Llama4ForConditionalGeneration.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1/99"=GGVYYZgZnZno+,2248L8L8NNHHXXcdrdxdxyzd{c|}  "!r<   r   r   r1  r|  r'  r  r  r  r  r}  image_sizesr   r>   c                 *   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  j
                  }||n| j                   j                  j                  }|du |duz  rt        d      ||t        d      | | j                         |      }|| j                  ||||      }|j                  d|j                  d            }| j                  |      j                  |j                  |j                        }| j!                  |||      }|j#                  ||      } | j$                  d|||||
|||||d
|}|d   }d}|	<||dd|j&                  d	   d	z
   df   j                  |j                        }|d
ddddf   |j                  |j                        dk7     j)                         }|	d
d	df   |j                  |	j                        dk7     j)                         }n1|d
ddddf   j)                         }|	d
d	df   j)                         }t+        j,                         } ||j                  d|j                  d            |j                  d      j                  |j                              }|s|f|d	d z   }||f|z   S |S t/        |||j0                  |j2                  |j4                  |      S d      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```Nr]  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)r  r-  r.  r?  r@   )r[  r  )
r   r   r1  r[  r'  r  r  r  r  r}  r   r    .)r  r  r1  r=   rM  r  r)  )r%   r  r  r  r  r-  r.  rb  r  r4  rC   r  r"  r   r   r   r>  masked_scatterrq  rD   r   r0   CrossEntropyLossr  r1  r=   rM  )r9   rZ  r  r   r   r1  r[  r-  r.  r|  r'  r  r  r  r  r}  r?  r   r  vision_flatprojected_vision_flatr<  r  r  r  shift_attention_maskshift_logitsshift_labelsloss_fctr|   s                                 r;   rK   z&Llama4ForConditionalGeneration.forward  s   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] $/ !**?? 	 .9 +**II 	' -t";<YZZ#(Av   7D557	BM#!44)%9/M'	 5 N )--b.2E2Eb2IJK$($>$>{$K$N$N$$m&9&9%! "&!:!:G\ "; " *889KMbcM%$%% 
)%+'/!5#))
 
 ) (6a6<<?Q;N9O9Q6Q'R'U'UV\VcVc'd$%c3B3k23G3J3J6==3Y]^3^_jjl%c12g/C/F/Fv}}/UYZ/Z[ffh%c3B3k2==?%c12g99;**,H!!"l&7&7&;<l>O>OPR>S>V>VWcWjWj>kD Y,F'+'7D7V#CVC+#33!//))2>2J
 	
 QU
 	
r<   c           	      f     | j                   j                  |f|||||d|}	|d   dk(  r||	d<   |	S )N)r1  r[  r   r  r}  r   r  )rq  prepare_inputs_for_generation)
r9   rZ  r1  r[  r  r   r  r}  r   model_inputss
             r;   rJ  z<Llama4ForConditionalGeneration.prepare_inputs_for_generationW  s_     It**HH
+')))
 
 !! ,8L(r<   )NNNNNNNNNNNNNNr   N)NNNNNN)$rL   rM   rN   rl  r  rm  r!   rB  r*   r  r%  r'  r)  ry  r{  r2   r.  r   r  r  r   r4  r  r>  r   r   rO   r
   r-  r   r   r~   r  rK   rJ  rP   rQ   s   @r;   r  r  s  sb   13MNH	| 	:8;B11'' $CcN3 ),	<"))":?:K:K"]b]n]n".  '+*.1537+/59@D8<-1$(,0/3&*5934$(#I
##I
 ''I
 !.	I

 u//0I
 "%I
   1 12I
 'uS$s)^'<=I
 )1I
 ))*I
 D>I
 $D>I
 'tnI
 d^I
 !!1!12I
  c5<</0!I
" \\#I
$ +,%I
& 
u22	3'I
 I
\ r<   r  )r0  rK  r=  rp  r  )r  )_r  dataclassesr   typingr   r   r   r2   torch.nnr0   torch.nn.functionalr   r  /transformers.models.llama4.configuration_llama4r   activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   configuration_llama4r!   r"   
get_loggerrL   loggerModuler$   rS   ra   rv   rX   r   r   r   rO   r~   r   r  r   ro   r   r   r   r  r0  rK  rp  r  r  r  r  r  r  r  r  r  r  r  r  r  r=  r  __all__r)  r<   r;   <module>rc     s     ! , ,     N ! . ) 7 K B 9 m m K F & R R / @ 
		H	%		 B)BII )$!uxx !=		 =(,299 , _-"BII " ."*		 >	2	2	2 ||	2 5<<%&		2	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 U\\*% % %D %II%<<% 
% <<	%
 U\\*% % %4Z)")) Z)z27 2j #KO #K #KL _
+ _
 _
DT
- T
n 
<; < <2;uxx ;"		 (
)")) 
)!ELL ! !:<<:	: ll: 5<<%&	:8)BII 8)vbii )9 )XO
")) O
dbii (6")) 6,C
- C
L@%:O @Fr<   